diff --git a/Documentation/ABI/testing/sysfs-class-cxl b/Documentation/ABI/testing/sysfs-class-cxl index 7fd737eed38a..4ba0a2a61926 100644 --- a/Documentation/ABI/testing/sysfs-class-cxl +++ b/Documentation/ABI/testing/sysfs-class-cxl @@ -233,3 +233,11 @@ Description: read/write 0 = don't trust, the image may be different (default) 1 = trust that the image will not change. Users: https://github.com/ibm-capi/libcxl + +What: /sys/class/cxl//psl_timebase_synced +Date: March 2016 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Returns 1 if the psl timebase register is synchronized + with the core timebase register, 0 otherwise. +Users: https://github.com/ibm-capi/libcxl diff --git a/Documentation/features/perf/perf-regs/arch-support.txt b/Documentation/features/perf/perf-regs/arch-support.txt index e2b4a78ec543..f179b1fb26ef 100644 --- a/Documentation/features/perf/perf-regs/arch-support.txt +++ b/Documentation/features/perf/perf-regs/arch-support.txt @@ -27,7 +27,7 @@ | nios2: | TODO | | openrisc: | TODO | | parisc: | TODO | - | powerpc: | TODO | + | powerpc: | ok | | s390: | TODO | | score: | TODO | | sh: | TODO | diff --git a/Documentation/features/perf/perf-stackdump/arch-support.txt b/Documentation/features/perf/perf-stackdump/arch-support.txt index 3dc24b0673c0..85777c5c6353 100644 --- a/Documentation/features/perf/perf-stackdump/arch-support.txt +++ b/Documentation/features/perf/perf-stackdump/arch-support.txt @@ -27,7 +27,7 @@ | nios2: | TODO | | openrisc: | TODO | | parisc: | TODO | - | powerpc: | TODO | + | powerpc: | ok | | s390: | TODO | | score: | TODO | | sh: | TODO | diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt index 9d4e33df624c..678189280bb4 100644 --- a/Documentation/powerpc/eeh-pci-error-recovery.txt +++ b/Documentation/powerpc/eeh-pci-error-recovery.txt @@ -12,7 +12,7 @@ Overview: The IBM POWER-based pSeries and iSeries computers include PCI bus controller chips that have extended capabilities for detecting and reporting a large variety of PCI bus error conditions. These features -go under the name of "EEH", for "Extended Error Handling". The EEH +go under the name of "EEH", for "Enhanced Error Handling". The EEH hardware features allow PCI bus errors to be cleared and a PCI card to be "rebooted", without also having to reboot the operating system. diff --git a/MAINTAINERS b/MAINTAINERS index 65f3277a8cf0..2c670ba7b7db 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6675,6 +6675,19 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git S: Supported F: Documentation/powerpc/ F: arch/powerpc/ +F: drivers/char/tpm/tpm_ibmvtpm* +F: drivers/crypto/nx/ +F: drivers/crypto/vmx/ +F: drivers/net/ethernet/ibm/ibmveth.* +F: drivers/net/ethernet/ibm/ibmvnic.* +F: drivers/pci/hotplug/rpa* +F: drivers/scsi/ibmvscsi/ +N: opal +N: /pmac +N: powermac +N: powernv +N: [^a-z0-9]ps3 +N: pseries LINUX FOR POWER MACINTOSH M: Benjamin Herrenschmidt diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a18a0dcd57b7..f0403b58ae8b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -116,6 +116,8 @@ config PPC select GENERIC_ATOMIC64 if PPC32 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_PERF_EVENTS + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64 select ARCH_WANT_IPC_PARSE_VERSION @@ -606,9 +608,9 @@ endchoice config FORCE_MAX_ZONEORDER int "Maximum zone order" - range 9 64 if PPC64 && PPC_64K_PAGES + range 8 9 if PPC64 && PPC_64K_PAGES default "9" if PPC64 && PPC_64K_PAGES - range 13 64 if PPC64 && !PPC_64K_PAGES + range 9 13 if PPC64 && !PPC_64K_PAGES default "13" if PPC64 && !PPC_64K_PAGES range 9 64 if PPC32 && PPC_16K_PAGES default "9" if PPC32 && PPC_16K_PAGES @@ -795,7 +797,6 @@ config 4xx_SOC config FSL_LBC bool "Freescale Local Bus support" - depends on FSL_SOC help Enables reporting of errors from the Freescale local bus controller. Also contains some common code used by diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 638f9ce740f5..d3fcf7e64e3a 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -19,14 +19,6 @@ config PPC_WERROR depends on !PPC_DISABLE_WERROR default y -config STRICT_MM_TYPECHECKS - bool "Do extra type checking on mm types" - default n - help - This option turns on extra type checking for some mm related types. - - If you don't know what this means, say N. - config PRINT_STACK_DEPTH int "Stack depth to print" if DEBUG_KERNEL default 64 diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 61165101342c..8fe78a3efc92 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -362,9 +362,6 @@ $(obj)/cuImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) $(obj)/cuImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) $(call if_changed,wrap,cuboot-$*,,$(obj)/$*.dtb) -$(obj)/cuImage.%: vmlinux $(obj)/fsl/%.dtb $(wrapperbits) - $(call if_changed,wrap,cuboot-$*,,$(obj)/fsl/$*.dtb) - $(obj)/simpleImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) $(call if_changed,wrap,simpleboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz) @@ -381,6 +378,9 @@ $(obj)/treeImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) $(obj)/%.dtb: $(src)/dts/%.dts FORCE $(call if_changed_dep,dtc) +$(obj)/%.dtb: $(src)/dts/fsl/%.dts FORCE + $(call if_changed_dep,dtc) + # If there isn't a platform selected then just strip the vmlinux. ifeq (,$(image-y)) image-y := vmlinux.strip diff --git a/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts b/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts index 0424fc2bd0e0..c88d4ef9e4f7 100644 --- a/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts +++ b/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts @@ -211,6 +211,10 @@ pcie@0 { 0x0 0x00400000>; }; }; + + pci1: pcie@fef09000 { + status = "disabled"; + }; }; /include/ "mpc8641si-post.dtsi" diff --git a/arch/powerpc/boot/dts/fsl/gef_sbc310.dts b/arch/powerpc/boot/dts/fsl/gef_sbc310.dts index 84b3d38f880e..838515798cce 100644 --- a/arch/powerpc/boot/dts/fsl/gef_sbc310.dts +++ b/arch/powerpc/boot/dts/fsl/gef_sbc310.dts @@ -24,10 +24,6 @@ / { model = "GEF_SBC310"; compatible = "gef,sbc310"; - aliases { - pci1 = &pci1; - }; - memory { device_type = "memory"; reg = <0x0 0x40000000>; // set by uboot @@ -223,29 +219,11 @@ pcie@0 { }; pci1: pcie@fef09000 { - compatible = "fsl,mpc8641-pcie"; - device_type = "pci"; - #size-cells = <2>; - #address-cells = <3>; reg = <0xfef09000 0x1000>; - bus-range = <0x0 0xff>; ranges = <0x02000000 0x0 0xc0000000 0xc0000000 0x0 0x20000000 0x01000000 0x0 0x00000000 0xfe400000 0x0 0x00400000>; - clock-frequency = <100000000>; - interrupts = <0x19 0x2 0 0>; - interrupt-map-mask = <0xf800 0x0 0x0 0x7>; - interrupt-map = < - 0x0000 0x0 0x0 0x1 &mpic 0x4 0x2 - 0x0000 0x0 0x0 0x2 &mpic 0x5 0x2 - 0x0000 0x0 0x0 0x3 &mpic 0x6 0x2 - 0x0000 0x0 0x0 0x4 &mpic 0x7 0x2 - >; pcie@0 { - reg = <0 0 0 0 0>; - #size-cells = <2>; - #address-cells = <3>; - device_type = "pci"; ranges = <0x02000000 0x0 0xc0000000 0x02000000 0x0 0xc0000000 0x0 0x20000000 diff --git a/arch/powerpc/boot/dts/fsl/gef_sbc610.dts b/arch/powerpc/boot/dts/fsl/gef_sbc610.dts index 974446acce23..ff423ab424f2 100644 --- a/arch/powerpc/boot/dts/fsl/gef_sbc610.dts +++ b/arch/powerpc/boot/dts/fsl/gef_sbc610.dts @@ -209,6 +209,10 @@ pcie@0 { 0x0 0x00400000>; }; }; + + pci1: pcie@fef09000 { + status = "disabled"; + }; }; /include/ "mpc8641si-post.dtsi" diff --git a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts index 554001f2e96a..11bea3e6a43f 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts +++ b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts @@ -15,10 +15,6 @@ / { model = "MPC8641HPCN"; compatible = "fsl,mpc8641hpcn"; - aliases { - pci1 = &pci1; - }; - memory { device_type = "memory"; reg = <0x00000000 0x40000000>; // 1G at 0x0 @@ -359,29 +355,11 @@ gpio@400 { }; pci1: pcie@ffe09000 { - compatible = "fsl,mpc8641-pcie"; - device_type = "pci"; - #size-cells = <2>; - #address-cells = <3>; reg = <0xffe09000 0x1000>; - bus-range = <0 0xff>; ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000 0x01000000 0x0 0x00000000 0xffc10000 0x0 0x00010000>; - clock-frequency = <100000000>; - interrupts = <25 2 0 0>; - interrupt-map-mask = <0xf800 0 0 7>; - interrupt-map = < - /* IDSEL 0x0 */ - 0x0000 0 0 1 &mpic 4 1 - 0x0000 0 0 2 &mpic 5 1 - 0x0000 0 0 3 &mpic 6 1 - 0x0000 0 0 4 &mpic 7 1 - >; + pcie@0 { - reg = <0 0 0 0 0>; - #size-cells = <2>; - #address-cells = <3>; - device_type = "pci"; ranges = <0x02000000 0x0 0xa0000000 0x02000000 0x0 0xa0000000 0x0 0x20000000 diff --git a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts index fec58671a6d6..7ff62046a9ea 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts +++ b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts @@ -17,10 +17,6 @@ / { #address-cells = <2>; #size-cells = <2>; - aliases { - pci1 = &pci1; - }; - memory { device_type = "memory"; reg = <0x0 0x00000000 0x0 0x40000000>; // 1G at 0x0 @@ -326,29 +322,11 @@ gpio@400 { }; pci1: pcie@fffe09000 { - compatible = "fsl,mpc8641-pcie"; - device_type = "pci"; - #size-cells = <2>; - #address-cells = <3>; reg = <0x0f 0xffe09000 0x0 0x1000>; - bus-range = <0x0 0xff>; ranges = <0x02000000 0x0 0xe0000000 0x0c 0x20000000 0x0 0x20000000 0x01000000 0x0 0x00000000 0x0f 0xffc10000 0x0 0x00010000>; - clock-frequency = <100000000>; - interrupts = <25 2 0 0>; - interrupt-map-mask = <0xf800 0 0 7>; - interrupt-map = < - /* IDSEL 0x0 */ - 0x0000 0 0 1 &mpic 4 1 - 0x0000 0 0 2 &mpic 5 1 - 0x0000 0 0 3 &mpic 6 1 - 0x0000 0 0 4 &mpic 7 1 - >; + pcie@0 { - reg = <0 0 0 0 0>; - #size-cells = <2>; - #address-cells = <3>; - device_type = "pci"; ranges = <0x02000000 0x0 0xe0000000 0x02000000 0x0 0xe0000000 0x0 0x20000000 diff --git a/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi b/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi index 70889d8e8850..eeb7c65d5f22 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi @@ -102,19 +102,46 @@ &pci0 { bus-range = <0x0 0xff>; clock-frequency = <100000000>; interrupts = <24 2 0 0>; - interrupt-map-mask = <0xf800 0x0 0x0 0x7>; - - interrupt-map = < - 0x0000 0x0 0x0 0x1 &mpic 0x0 0x1 - 0x0000 0x0 0x0 0x2 &mpic 0x1 0x1 - 0x0000 0x0 0x0 0x3 &mpic 0x2 0x1 - 0x0000 0x0 0x0 0x4 &mpic 0x3 0x1 - >; pcie@0 { reg = <0 0 0 0 0>; + #interrupt-cells = <1>; #size-cells = <2>; #address-cells = <3>; device_type = "pci"; + interrupts = <24 2 0 0>; + interrupt-map-mask = <0xf800 0x0 0x0 0x7>; + interrupt-map = < + 0x0000 0x0 0x0 0x1 &mpic 0x0 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x2 &mpic 0x1 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x3 &mpic 0x2 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x4 &mpic 0x3 0x1 0x0 0x0 + >; + }; +}; + +&pci1 { + compatible = "fsl,mpc8641-pcie"; + device_type = "pci"; + #size-cells = <2>; + #address-cells = <3>; + bus-range = <0x0 0xff>; + clock-frequency = <100000000>; + interrupts = <25 2 0 0>; + + pcie@0 { + reg = <0 0 0 0 0>; + #interrupt-cells = <1>; + #size-cells = <2>; + #address-cells = <3>; + device_type = "pci"; + interrupts = <25 2 0 0>; + interrupt-map-mask = <0xf800 0x0 0x0 0x7>; + interrupt-map = < + 0x0000 0x0 0x0 0x1 &mpic 0x4 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x2 &mpic 0x5 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x3 &mpic 0x6 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x4 &mpic 0x7 0x1 0x0 0x0 + >; }; }; diff --git a/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi b/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi index 9e03328561d3..7c6db6f7c12e 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi +++ b/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi @@ -25,6 +25,7 @@ aliases { serial0 = &serial0; serial1 = &serial1; pci0 = &pci0; + pci1 = &pci1; }; cpus { diff --git a/arch/powerpc/boot/dts/fsl/sbc8641d.dts b/arch/powerpc/boot/dts/fsl/sbc8641d.dts index 0a9733cd418d..75870a124903 100644 --- a/arch/powerpc/boot/dts/fsl/sbc8641d.dts +++ b/arch/powerpc/boot/dts/fsl/sbc8641d.dts @@ -19,10 +19,6 @@ / { model = "SBC8641D"; compatible = "wind,sbc8641"; - aliases { - pci1 = &pci1; - }; - memory { device_type = "memory"; reg = <0x00000000 0x20000000>; // 512M at 0x0 @@ -165,30 +161,11 @@ pcie@0 { }; pci1: pcie@f8009000 { - compatible = "fsl,mpc8641-pcie"; - device_type = "pci"; - #size-cells = <2>; - #address-cells = <3>; reg = <0xf8009000 0x1000>; - bus-range = <0 0xff>; ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000 0x01000000 0x0 0x00000000 0xe3000000 0x0 0x00100000>; - clock-frequency = <100000000>; - interrupts = <25 2 0 0>; - interrupt-map-mask = <0xf800 0 0 7>; - interrupt-map = < - /* IDSEL 0x0 */ - 0x0000 0 0 1 &mpic 4 1 - 0x0000 0 0 2 &mpic 5 1 - 0x0000 0 0 3 &mpic 6 1 - 0x0000 0 0 4 &mpic 7 1 - >; pcie@0 { - reg = <0 0 0 0 0>; - #size-cells = <2>; - #address-cells = <3>; - device_type = "pci"; ranges = <0x02000000 0x0 0xa0000000 0x02000000 0x0 0xa0000000 0x0 0x20000000 diff --git a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi index 99e421df79d4..6e0b4892a740 100644 --- a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi @@ -263,7 +263,7 @@ mux1: mux1@20 { }; rcpm: global-utilities@e2000 { - compatible = "fsl,t1023-rcpm", "fsl,qoriq-rcpm-2.0"; + compatible = "fsl,t1023-rcpm", "fsl,qoriq-rcpm-2.1"; reg = <0xe2000 0x1000>; }; diff --git a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi index e0f4da554774..507649ece0a1 100644 --- a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi @@ -472,7 +472,7 @@ mux3: mux3@60 { }; rcpm: global-utilities@e2000 { - compatible = "fsl,t1040-rcpm", "fsl,qoriq-rcpm-2.0"; + compatible = "fsl,t1040-rcpm", "fsl,qoriq-rcpm-2.1"; reg = <0xe2000 0x1000>; }; diff --git a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi index 72691ef102ee..7c4afdb44b46 100644 --- a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi +++ b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi @@ -109,7 +109,7 @@ spi@110000 { flash@0 { #address-cells = <1>; #size-cells = <1>; - compatible = "micron,n25q512a", "jedec,spi-nor"; + compatible = "micron,n25q512ax3", "jedec,spi-nor"; reg = <0>; spi-max-frequency = <10000000>; /* input clock */ }; diff --git a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi index dc9326875778..ff87e67c70da 100644 --- a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi +++ b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi @@ -113,7 +113,7 @@ spi@110000 { flash@0 { #address-cells = <1>; #size-cells = <1>; - compatible = "micron,n25q512a", "jedec,spi-nor"; + compatible = "micron,n25q512ax3", "jedec,spi-nor"; reg = <0>; spi-max-frequency = <10000000>; /* input clock */ }; diff --git a/arch/powerpc/include/asm/book3s/32/hash.h b/arch/powerpc/include/asm/book3s/32/hash.h index 264b754d65b0..880db13a2e9f 100644 --- a/arch/powerpc/include/asm/book3s/32/hash.h +++ b/arch/powerpc/include/asm/book3s/32/hash.h @@ -39,8 +39,5 @@ #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) -/* Hash table based platforms need atomic updates of the linux PTE */ -#define PTE_ATOMIC_UPDATES 1 - #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_BOOK3S_32_HASH_H */ diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 16f513e5cbd7..b82e063494dd 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -1,5 +1,5 @@ -#ifndef _ASM_POWERPC_MMU_HASH32_H_ -#define _ASM_POWERPC_MMU_HASH32_H_ +#ifndef _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_ +#define _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_ /* * 32-bit hash table MMU support */ @@ -90,4 +90,4 @@ typedef struct { #define mmu_virtual_psize MMU_PAGE_4K #define mmu_linear_psize MMU_PAGE_256M -#endif /* _ASM_POWERPC_MMU_HASH32_H_ */ +#endif /* _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_ */ diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h new file mode 100644 index 000000000000..a2350194fc76 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -0,0 +1,109 @@ +#ifndef _ASM_POWERPC_BOOK3S_32_PGALLOC_H +#define _ASM_POWERPC_BOOK3S_32_PGALLOC_H + +#include + +/* For 32-bit, all levels of page tables are just drawn from get_free_page() */ +#define MAX_PGTABLE_INDEX_SIZE 0 + +extern void __bad_pte(pmd_t *pmd); + +extern pgd_t *pgd_alloc(struct mm_struct *mm); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); + +/* + * We don't have any real pmd's, and this code never triggers because + * the pgd will always be present.. + */ +/* #define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) */ +#define pmd_free(mm, x) do { } while (0) +#define __pmd_free_tlb(tlb,x,a) do { } while (0) +/* #define pgd_populate(mm, pmd, pte) BUG() */ + +#ifndef CONFIG_BOOKE + +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, + pte_t *pte) +{ + *pmdp = __pmd(__pa(pte) | _PMD_PRESENT); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pte_page) +{ + *pmdp = __pmd((page_to_pfn(pte_page) << PAGE_SHIFT) | _PMD_PRESENT); +} + +#define pmd_pgtable(pmd) pmd_page(pmd) +#else + +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, + pte_t *pte) +{ + *pmdp = __pmd((unsigned long)pte | _PMD_PRESENT); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pte_page) +{ + *pmdp = __pmd((unsigned long)lowmem_page_address(pte_page) | _PMD_PRESENT); +} + +#define pmd_pgtable(pmd) pmd_page(pmd) +#endif + +extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); +extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr); + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + free_page((unsigned long)pte); +} + +static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) +{ + pgtable_page_dtor(ptepage); + __free_page(ptepage); +} + +static inline void pgtable_free(void *table, unsigned index_size) +{ + BUG_ON(index_size); /* 32-bit doesn't use this */ + free_page((unsigned long)table); +} + +#define check_pgt_cache() do { } while (0) + +#ifdef CONFIG_SMP +static inline void pgtable_free_tlb(struct mmu_gather *tlb, + void *table, int shift) +{ + unsigned long pgf = (unsigned long)table; + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + pgf |= shift; + tlb_remove_table(tlb, (void *)pgf); +} + +static inline void __tlb_remove_table(void *_table) +{ + void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); + unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; + + pgtable_free(table, shift); +} +#else +static inline void pgtable_free_tlb(struct mmu_gather *tlb, + void *table, int shift) +{ + pgtable_free(table, shift); +} +#endif + +static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, + unsigned long address) +{ + tlb_flush_pgtable(tlb, address); + pgtable_page_dtor(table); + pgtable_free_tlb(tlb, page_address(table), 0); +} +#endif /* _ASM_POWERPC_BOOK3S_32_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 5f08a0832238..1af837c561ba 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -5,58 +5,31 @@ * for each page table entry. The PMD and PGD level use a 32b record for * each entry by assuming that each entry is page aligned. */ -#define PTE_INDEX_SIZE 9 -#define PMD_INDEX_SIZE 7 -#define PUD_INDEX_SIZE 9 -#define PGD_INDEX_SIZE 9 +#define H_PTE_INDEX_SIZE 9 +#define H_PMD_INDEX_SIZE 7 +#define H_PUD_INDEX_SIZE 9 +#define H_PGD_INDEX_SIZE 9 #ifndef __ASSEMBLY__ -#define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE) -#define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) -#define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) -#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) -#endif /* __ASSEMBLY__ */ - -#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) -#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) -#define PTRS_PER_PUD (1 << PUD_INDEX_SIZE) -#define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) - -/* PMD_SHIFT determines what a second-level page table entry can map */ -#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE) -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) +#define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE) +#define H_PMD_TABLE_SIZE (sizeof(pmd_t) << H_PMD_INDEX_SIZE) +#define H_PUD_TABLE_SIZE (sizeof(pud_t) << H_PUD_INDEX_SIZE) +#define H_PGD_TABLE_SIZE (sizeof(pgd_t) << H_PGD_INDEX_SIZE) /* With 4k base page size, hugepage PTEs go at the PMD level */ #define MIN_HUGEPTE_SHIFT PMD_SHIFT -/* PUD_SHIFT determines what a third-level page table entry can map */ -#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) -#define PUD_SIZE (1UL << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) - -/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ -#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -/* Bits to mask out from a PMD to get to the PTE page */ -#define PMD_MASKED_BITS 0 -/* Bits to mask out from a PUD to get to the PMD page */ -#define PUD_MASKED_BITS 0 -/* Bits to mask out from a PGD to get to the PUD page */ -#define PGD_MASKED_BITS 0 - /* PTE flags to conserve for HPTE identification */ -#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | \ - _PAGE_F_SECOND | _PAGE_F_GIX) - -/* shift to put page number into pte */ -#define PTE_RPN_SHIFT (12) -#define PTE_RPN_SIZE (45) /* gives 57-bit real addresses */ - -#define _PAGE_4K_PFN 0 -#ifndef __ASSEMBLY__ +#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | \ + H_PAGE_F_SECOND | H_PAGE_F_GIX) +/* + * Not supported by 4k linux page size + */ +#define H_PAGE_4K_PFN 0x0 +#define H_PAGE_THP_HUGE 0x0 +#define H_PAGE_COMBO 0x0 +#define H_PTE_FRAG_NR 0 +#define H_PTE_FRAG_SIZE_SHIFT 0 /* * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range() */ @@ -64,26 +37,7 @@ remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot)) #ifdef CONFIG_HUGETLB_PAGE -/* - * For 4k page size, we support explicit hugepage via hugepd - */ -static inline int pmd_huge(pmd_t pmd) -{ - return 0; -} - -static inline int pud_huge(pud_t pud) -{ - return 0; -} - -static inline int pgd_huge(pgd_t pgd) -{ - return 0; -} -#define pgd_huge pgd_huge - -static inline int hugepd_ok(hugepd_t hpd) +static inline int hash__hugepd_ok(hugepd_t hpd) { /* * if it is not a pte and have hugepd shift mask @@ -94,7 +48,65 @@ static inline int hugepd_ok(hugepd_t hpd) return true; return false; } -#define is_hugepd(hpd) (hugepd_ok(hpd)) +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +static inline char *get_hpte_slot_array(pmd_t *pmdp) +{ + BUG(); + return NULL; +} + +static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index) +{ + BUG(); + return 0; +} + +static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array, + int index) +{ + BUG(); + return 0; +} + +static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array, + unsigned int index, unsigned int hidx) +{ + BUG(); +} + +static inline int hash__pmd_trans_huge(pmd_t pmd) +{ + return 0; +} + +static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + BUG(); + return 0; +} + +static inline pmd_t hash__pmd_mkhuge(pmd_t pmd) +{ + BUG(); + return pmd; +} + +extern unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + unsigned long clr, unsigned long set); +extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); +extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); +extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp); +extern int hash__has_transparent_hugepage(void); #endif #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 0a7956a80a08..5aae4f530c21 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -1,73 +1,44 @@ #ifndef _ASM_POWERPC_BOOK3S_64_HASH_64K_H #define _ASM_POWERPC_BOOK3S_64_HASH_64K_H -#define PTE_INDEX_SIZE 8 -#define PMD_INDEX_SIZE 5 -#define PUD_INDEX_SIZE 5 -#define PGD_INDEX_SIZE 12 - -#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) -#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) -#define PTRS_PER_PUD (1 << PUD_INDEX_SIZE) -#define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) +#define H_PTE_INDEX_SIZE 8 +#define H_PMD_INDEX_SIZE 5 +#define H_PUD_INDEX_SIZE 5 +#define H_PGD_INDEX_SIZE 12 /* With 4k base page size, hugepage PTEs go at the PMD level */ #define MIN_HUGEPTE_SHIFT PAGE_SHIFT -/* PMD_SHIFT determines what a second-level page table entry can map */ -#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE) -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) - -/* PUD_SHIFT determines what a third-level page table entry can map */ -#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) -#define PUD_SIZE (1UL << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) - -/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ -#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -#define _PAGE_COMBO 0x00001000 /* this is a combo 4k page */ -#define _PAGE_4K_PFN 0x00002000 /* PFN is for a single 4k page */ +#define H_PAGE_COMBO 0x00001000 /* this is a combo 4k page */ +#define H_PAGE_4K_PFN 0x00002000 /* PFN is for a single 4k page */ /* - * Used to track subpage group valid if _PAGE_COMBO is set - * This overloads _PAGE_F_GIX and _PAGE_F_SECOND + * We need to differentiate between explicit huge page and THP huge + * page, since THP huge page also need to track real subpage details */ -#define _PAGE_COMBO_VALID (_PAGE_F_GIX | _PAGE_F_SECOND) +#define H_PAGE_THP_HUGE H_PAGE_4K_PFN + +/* + * Used to track subpage group valid if H_PAGE_COMBO is set + * This overloads H_PAGE_F_GIX and H_PAGE_F_SECOND + */ +#define H_PAGE_COMBO_VALID (H_PAGE_F_GIX | H_PAGE_F_SECOND) /* PTE flags to conserve for HPTE identification */ -#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_F_SECOND | \ - _PAGE_F_GIX | _PAGE_HASHPTE | _PAGE_COMBO) - -/* Shift to put page number into pte. - * - * That gives us a max RPN of 41 bits, which means a max of 57 bits - * of addressable physical space, or 53 bits for the special 4k PFNs. - */ -#define PTE_RPN_SHIFT (16) -#define PTE_RPN_SIZE (41) - +#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_F_SECOND | \ + H_PAGE_F_GIX | H_PAGE_HASHPTE | H_PAGE_COMBO) /* * we support 16 fragments per PTE page of 64K size. */ -#define PTE_FRAG_NR 16 +#define H_PTE_FRAG_NR 16 /* * We use a 2K PTE page fragment and another 2K for storing * real_pte_t hash index */ -#define PTE_FRAG_SIZE_SHIFT 12 +#define H_PTE_FRAG_SIZE_SHIFT 12 #define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT) -/* Bits to mask out from a PMD to get to the PTE page */ -#define PMD_MASKED_BITS 0xc0000000000000ffUL -/* Bits to mask out from a PUD to get to the PMD page */ -#define PUD_MASKED_BITS 0xc0000000000000ffUL -/* Bits to mask out from a PGD to get to the PUD page */ -#define PGD_MASKED_BITS 0xc0000000000000ffUL - #ifndef __ASSEMBLY__ +#include /* * With 64K pages on hash table, we have a special PTE format that @@ -83,9 +54,9 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep) rpte.pte = pte; rpte.hidx = 0; - if (pte_val(pte) & _PAGE_COMBO) { + if (pte_val(pte) & H_PAGE_COMBO) { /* - * Make sure we order the hidx load against the _PAGE_COMBO + * Make sure we order the hidx load against the H_PAGE_COMBO * check. The store side ordering is done in __hash_page_4K */ smp_rmb(); @@ -97,9 +68,9 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep) static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index) { - if ((pte_val(rpte.pte) & _PAGE_COMBO)) + if ((pte_val(rpte.pte) & H_PAGE_COMBO)) return (rpte.hidx >> (index<<2)) & 0xf; - return (pte_val(rpte.pte) >> _PAGE_F_GIX_SHIFT) & 0xf; + return (pte_val(rpte.pte) >> H_PAGE_F_GIX_SHIFT) & 0xf; } #define __rpte_to_pte(r) ((r).pte) @@ -122,79 +93,32 @@ extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index); #define pte_iterate_hashed_end() } while(0); } } while(0) #define pte_pagesize_index(mm, addr, pte) \ - (((pte) & _PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K) + (((pte) & H_PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K) -#define remap_4k_pfn(vma, addr, pfn, prot) \ - (WARN_ON(((pfn) >= (1UL << PTE_RPN_SIZE))) ? -EINVAL : \ - remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, \ - __pgprot(pgprot_val((prot)) | _PAGE_4K_PFN))) +extern int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); +static inline int hash__remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t prot) +{ + if (pfn > (PTE_RPN_MASK >> PAGE_SHIFT)) { + WARN(1, "remap_4k_pfn called with wrong pfn value\n"); + return -EINVAL; + } + return remap_pfn_range(vma, addr, pfn, PAGE_SIZE, + __pgprot(pgprot_val(prot) | H_PAGE_4K_PFN)); +} -#define PTE_TABLE_SIZE PTE_FRAG_SIZE +#define H_PTE_TABLE_SIZE PTE_FRAG_SIZE #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PMD_TABLE_SIZE ((sizeof(pmd_t) << PMD_INDEX_SIZE) + (sizeof(unsigned long) << PMD_INDEX_SIZE)) +#define H_PMD_TABLE_SIZE ((sizeof(pmd_t) << PMD_INDEX_SIZE) + \ + (sizeof(unsigned long) << PMD_INDEX_SIZE)) #else -#define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) +#define H_PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) #endif -#define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) -#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) - -#ifdef CONFIG_HUGETLB_PAGE -/* - * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have - * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD; - * - * Defined in such a way that we can optimize away code block at build time - * if CONFIG_HUGETLB_PAGE=n. - */ -static inline int pmd_huge(pmd_t pmd) -{ - /* - * leaf pte for huge page - */ - return !!(pmd_val(pmd) & _PAGE_PTE); -} - -static inline int pud_huge(pud_t pud) -{ - /* - * leaf pte for huge page - */ - return !!(pud_val(pud) & _PAGE_PTE); -} - -static inline int pgd_huge(pgd_t pgd) -{ - /* - * leaf pte for huge page - */ - return !!(pgd_val(pgd) & _PAGE_PTE); -} -#define pgd_huge pgd_huge - -#ifdef CONFIG_DEBUG_VM -extern int hugepd_ok(hugepd_t hpd); -#define is_hugepd(hpd) (hugepd_ok(hpd)) -#else -/* - * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't - * need to setup hugepage directory for them. Our pte and page directory format - * enable us to have this enabled. - */ -static inline int hugepd_ok(hugepd_t hpd) -{ - return 0; -} -#define is_hugepd(pdep) 0 -#endif /* CONFIG_DEBUG_VM */ - -#endif /* CONFIG_HUGETLB_PAGE */ +#define H_PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) +#define H_PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) #ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern unsigned long pmd_hugepage_update(struct mm_struct *mm, - unsigned long addr, - pmd_t *pmdp, - unsigned long clr, - unsigned long set); static inline char *get_hpte_slot_array(pmd_t *pmdp) { /* @@ -253,50 +177,35 @@ static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array, * that for explicit huge pages. * */ -static inline int pmd_trans_huge(pmd_t pmd) +static inline int hash__pmd_trans_huge(pmd_t pmd) { - return !!((pmd_val(pmd) & (_PAGE_PTE | _PAGE_THP_HUGE)) == - (_PAGE_PTE | _PAGE_THP_HUGE)); + return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE)) == + (_PAGE_PTE | H_PAGE_THP_HUGE)); } -static inline int pmd_large(pmd_t pmd) +static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b) { - return !!(pmd_val(pmd) & _PAGE_PTE); + return (((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); } -static inline pmd_t pmd_mknotpresent(pmd_t pmd) +static inline pmd_t hash__pmd_mkhuge(pmd_t pmd) { - return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT); -} - -#define __HAVE_ARCH_PMD_SAME -static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) -{ - return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0); -} - -static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - unsigned long old; - - if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) - return 0; - old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0); - return ((old & _PAGE_ACCESSED) != 0); -} - -#define __HAVE_ARCH_PMDP_SET_WRPROTECT -static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - - if ((pmd_val(*pmdp) & _PAGE_RW) == 0) - return; - - pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0); + return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE)); } +extern unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + unsigned long clr, unsigned long set); +extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); +extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); +extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp); +extern int hash__has_transparent_hugepage(void); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index d0ee6fcef823..f61cad3de4e6 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -13,48 +13,12 @@ * We could create separate kernel read-only if we used the 3 PP bits * combinations that newer processors provide but we currently don't. */ -#define _PAGE_BIT_SWAP_TYPE 0 - -#define _PAGE_EXEC 0x00001 /* execute permission */ -#define _PAGE_RW 0x00002 /* read & write access allowed */ -#define _PAGE_READ 0x00004 /* read access allowed */ -#define _PAGE_USER 0x00008 /* page may be accessed by userspace */ -#define _PAGE_GUARDED 0x00010 /* G: guarded (side-effect) page */ -/* M (memory coherence) is always set in the HPTE, so we don't need it here */ -#define _PAGE_COHERENT 0x0 -#define _PAGE_NO_CACHE 0x00020 /* I: cache inhibit */ -#define _PAGE_WRITETHRU 0x00040 /* W: cache write-through */ -#define _PAGE_DIRTY 0x00080 /* C: page changed */ -#define _PAGE_ACCESSED 0x00100 /* R: page referenced */ -#define _PAGE_SPECIAL 0x00400 /* software: special page */ -#define _PAGE_BUSY 0x00800 /* software: PTE & hash are busy */ - -#ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SOFT_DIRTY 0x200 /* software: software dirty tracking */ -#else -#define _PAGE_SOFT_DIRTY 0x000 -#endif - -#define _PAGE_F_GIX_SHIFT 57 -#define _PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */ -#define _PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */ -#define _PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */ -#define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */ -#define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */ - -/* - * We need to differentiate between explicit huge page and THP huge - * page, since THP huge page also need to track real subpage details - */ -#define _PAGE_THP_HUGE _PAGE_4K_PFN - -/* - * set of bits not changed in pmd_modify. - */ -#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ - _PAGE_ACCESSED | _PAGE_THP_HUGE | _PAGE_PTE | \ - _PAGE_SOFT_DIRTY) - +#define H_PAGE_BUSY 0x00800 /* software: PTE & hash are busy */ +#define H_PTE_NONE_MASK _PAGE_HPTEFLAGS +#define H_PAGE_F_GIX_SHIFT 57 +#define H_PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */ +#define H_PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */ +#define H_PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */ #ifdef CONFIG_PPC_64K_PAGES #include @@ -65,29 +29,33 @@ /* * Size of EA range mapped by our pagetables. */ -#define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ - PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT) -#define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE) +#define H_PGTABLE_EADDR_SIZE (H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE + \ + H_PUD_INDEX_SIZE + H_PGD_INDEX_SIZE + PAGE_SHIFT) +#define H_PGTABLE_RANGE (ASM_CONST(1) << H_PGTABLE_EADDR_SIZE) #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PMD_CACHE_INDEX (PMD_INDEX_SIZE + 1) +/* + * only with hash we need to use the second half of pmd page table + * to store pointer to deposited pgtable_t + */ +#define H_PMD_CACHE_INDEX (H_PMD_INDEX_SIZE + 1) #else -#define PMD_CACHE_INDEX PMD_INDEX_SIZE +#define H_PMD_CACHE_INDEX H_PMD_INDEX_SIZE #endif /* * Define the address range of the kernel non-linear virtual area */ -#define KERN_VIRT_START ASM_CONST(0xD000000000000000) -#define KERN_VIRT_SIZE ASM_CONST(0x0000100000000000) +#define H_KERN_VIRT_START ASM_CONST(0xD000000000000000) +#define H_KERN_VIRT_SIZE ASM_CONST(0x0000100000000000) /* * The vmalloc space starts at the beginning of that region, and * occupies half of it on hash CPUs and a quarter of it on Book3E * (we keep a quarter for the virtual memmap) */ -#define VMALLOC_START KERN_VIRT_START -#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) -#define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) +#define H_VMALLOC_START H_KERN_VIRT_START +#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE >> 1) +#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) /* * Region IDs @@ -96,7 +64,7 @@ #define REGION_MASK (0xfUL << REGION_SHIFT) #define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) -#define VMALLOC_REGION_ID (REGION_ID(VMALLOC_START)) +#define VMALLOC_REGION_ID (REGION_ID(H_VMALLOC_START)) #define KERNEL_REGION_ID (REGION_ID(PAGE_OFFSET)) #define VMEMMAP_REGION_ID (0xfUL) /* Server only */ #define USER_REGION_ID (0UL) @@ -105,381 +73,97 @@ * Defines the address of the vmemap area, in its own region on * hash table CPUs. */ -#define VMEMMAP_BASE (VMEMMAP_REGION_ID << REGION_SHIFT) +#define H_VMEMMAP_BASE (VMEMMAP_REGION_ID << REGION_SHIFT) #ifdef CONFIG_PPC_MM_SLICES #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN #endif /* CONFIG_PPC_MM_SLICES */ -/* No separate kernel read-only */ -#define _PAGE_KERNEL_RW (_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */ -#define _PAGE_KERNEL_RO _PAGE_KERNEL_RW -#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) - -/* Strong Access Ordering */ -#define _PAGE_SAO (_PAGE_WRITETHRU | _PAGE_NO_CACHE | _PAGE_COHERENT) - -/* No page size encoding in the linux PTE */ -#define _PAGE_PSIZE 0 /* PTEIDX nibble */ #define _PTEIDX_SECONDARY 0x8 #define _PTEIDX_GROUP_IX 0x7 -/* Hash table based platforms need atomic updates of the linux PTE */ -#define PTE_ATOMIC_UPDATES 1 -#define _PTE_NONE_MASK _PAGE_HPTEFLAGS -/* - * The mask convered by the RPN must be a ULL on 32-bit platforms with - * 64-bit PTEs - */ -#define PTE_RPN_MASK (((1UL << PTE_RPN_SIZE) - 1) << PTE_RPN_SHIFT) -/* - * _PAGE_CHG_MASK masks of bits that are to be preserved across - * pgprot changes - */ -#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ - _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \ - _PAGE_SOFT_DIRTY) -/* - * Mask of bits returned by pte_pgprot() - */ -#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ - _PAGE_WRITETHRU | _PAGE_4K_PFN | \ - _PAGE_USER | _PAGE_ACCESSED | \ - _PAGE_RW | _PAGE_DIRTY | _PAGE_EXEC | \ - _PAGE_SOFT_DIRTY) -/* - * We define 2 sets of base prot bits, one for basic pages (ie, - * cacheable kernel and user pages) and one for non cacheable - * pages. We always set _PAGE_COHERENT when SMP is enabled or - * the processor might need it for DMA coherency. - */ -#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) -#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) - -/* Permission masks used to generate the __P and __S table, - * - * Note:__pgprot is defined in arch/powerpc/include/asm/page.h - * - * Write permissions imply read permissions for now (we could make write-only - * pages on BookE but we don't bother for now). Execute permission control is - * possible on platforms that define _PAGE_EXEC - * - * Note due to the way vm flags are laid out, the bits are XWR - */ -#define PAGE_NONE __pgprot(_PAGE_BASE) -#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) -#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | \ - _PAGE_EXEC) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER ) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER ) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) - -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_X -#define __P101 PAGE_READONLY_X -#define __P110 PAGE_COPY_X -#define __P111 PAGE_COPY_X - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_X -#define __S101 PAGE_READONLY_X -#define __S110 PAGE_SHARED_X -#define __S111 PAGE_SHARED_X - -/* Permission masks used for kernel mappings */ -#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) -#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_NO_CACHE) -#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_NO_CACHE | _PAGE_GUARDED) -#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) -#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) -#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) - -/* Protection used for kernel text. We want the debuggers to be able to - * set breakpoints anywhere, so don't write protect the kernel text - * on platforms where such control is possible. - */ -#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\ - defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) -#define PAGE_KERNEL_TEXT PAGE_KERNEL_X -#else -#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX -#endif - -/* Make modules code happy. We don't set RO yet */ -#define PAGE_KERNEL_EXEC PAGE_KERNEL_X -#define PAGE_AGP (PAGE_KERNEL_NC) - -#define PMD_BAD_BITS (PTE_TABLE_SIZE-1) -#define PUD_BAD_BITS (PMD_TABLE_SIZE-1) +#define H_PMD_BAD_BITS (PTE_TABLE_SIZE-1) +#define H_PUD_BAD_BITS (PMD_TABLE_SIZE-1) #ifndef __ASSEMBLY__ -#define pmd_bad(pmd) (pmd_val(pmd) & PMD_BAD_BITS) -#define pmd_page_vaddr(pmd) __va(pmd_val(pmd) & ~PMD_MASKED_BITS) - -#define pud_bad(pud) (pud_val(pud) & PUD_BAD_BITS) -#define pud_page_vaddr(pud) __va(pud_val(pud) & ~PUD_MASKED_BITS) - -/* Pointers in the page table tree are physical addresses */ -#define __pgtable_ptr_val(ptr) __pa(ptr) - -#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1)) -#define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1)) -#define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1)) -#define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1)) +#define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) +#define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS) +static inline int hash__pgd_bad(pgd_t pgd) +{ + return (pgd_val(pgd) == 0); +} extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pte, int huge); extern unsigned long htab_convert_pte_flags(unsigned long pteflags); /* Atomic PTE updates */ -static inline unsigned long pte_update(struct mm_struct *mm, - unsigned long addr, - pte_t *ptep, unsigned long clr, - unsigned long set, - int huge) +static inline unsigned long hash__pte_update(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, unsigned long clr, + unsigned long set, + int huge) { - unsigned long old, tmp; + __be64 old_be, tmp_be; + unsigned long old; __asm__ __volatile__( "1: ldarx %0,0,%3 # pte_update\n\ - andi. %1,%0,%6\n\ + and. %1,%0,%6\n\ bne- 1b \n\ andc %1,%0,%4 \n\ or %1,%1,%7\n\ stdcx. %1,0,%3 \n\ bne- 1b" - : "=&r" (old), "=&r" (tmp), "=m" (*ptep) - : "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY), "r" (set) + : "=&r" (old_be), "=&r" (tmp_be), "=m" (*ptep) + : "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep), + "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) : "cc" ); /* huge pages use the old page table lock */ if (!huge) assert_pte_locked(mm, addr); - if (old & _PAGE_HASHPTE) + old = be64_to_cpu(old_be); + if (old & H_PAGE_HASHPTE) hpte_need_flush(mm, addr, ptep, old, huge); return old; } -static inline int __ptep_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - unsigned long old; - - if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) - return 0; - old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); - return (old & _PAGE_ACCESSED) != 0; -} -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define ptep_test_and_clear_young(__vma, __addr, __ptep) \ -({ \ - int __r; \ - __r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \ - __r; \ -}) - -#define __HAVE_ARCH_PTEP_SET_WRPROTECT -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - - if ((pte_val(*ptep) & _PAGE_RW) == 0) - return; - - pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); -} - -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - if ((pte_val(*ptep) & _PAGE_RW) == 0) - return; - - pte_update(mm, addr, ptep, _PAGE_RW, 0, 1); -} - -/* - * We currently remove entries from the hashtable regardless of whether - * the entry was young or dirty. The generic routines only flush if the - * entry was young or dirty which is not good enough. - * - * We should be more intelligent about this but for the moment we override - * these functions and force a tlb flush unconditionally - */ -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -#define ptep_clear_flush_young(__vma, __address, __ptep) \ -({ \ - int __young = __ptep_test_and_clear_young((__vma)->vm_mm, __address, \ - __ptep); \ - __young; \ -}) - -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0); - return __pte(old); -} - -static inline void pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t * ptep) -{ - pte_update(mm, addr, ptep, ~0UL, 0, 0); -} - - /* Set the dirty and/or accessed bits atomically in a linux PTE, this * function doesn't need to flush the hash entry */ -static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) +static inline void hash__ptep_set_access_flags(pte_t *ptep, pte_t entry) { - unsigned long bits = pte_val(entry) & - (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC | - _PAGE_SOFT_DIRTY); + __be64 old, tmp, val, mask; - unsigned long old, tmp; + mask = cpu_to_be64(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_READ | _PAGE_WRITE | + _PAGE_EXEC | _PAGE_SOFT_DIRTY); + + val = pte_raw(entry) & mask; __asm__ __volatile__( "1: ldarx %0,0,%4\n\ - andi. %1,%0,%6\n\ + and. %1,%0,%6\n\ bne- 1b \n\ or %0,%3,%0\n\ stdcx. %0,0,%4\n\ bne- 1b" :"=&r" (old), "=&r" (tmp), "=m" (*ptep) - :"r" (bits), "r" (ptep), "m" (*ptep), "i" (_PAGE_BUSY) + :"r" (val), "r" (ptep), "m" (*ptep), "r" (cpu_to_be64(H_PAGE_BUSY)) :"cc"); } -static inline int pgd_bad(pgd_t pgd) +static inline int hash__pte_same(pte_t pte_a, pte_t pte_b) { - return (pgd_val(pgd) == 0); + return (((pte_raw(pte_a) ^ pte_raw(pte_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); } -#define __HAVE_ARCH_PTE_SAME -#define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) -static inline unsigned long pgd_page_vaddr(pgd_t pgd) +static inline int hash__pte_none(pte_t pte) { - return (unsigned long)__va(pgd_val(pgd) & ~PGD_MASKED_BITS); -} - - -/* Generic accessors to PTE bits */ -static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);} -static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); } -static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); } -static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); } -static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; } -static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } - -#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY -static inline bool pte_soft_dirty(pte_t pte) -{ - return !!(pte_val(pte) & _PAGE_SOFT_DIRTY); -} -static inline pte_t pte_mksoft_dirty(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY); -} - -static inline pte_t pte_clear_soft_dirty(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY); -} -#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ - -#ifdef CONFIG_NUMA_BALANCING -/* - * These work without NUMA balancing but the kernel does not care. See the - * comment in include/asm-generic/pgtable.h . On powerpc, this will only - * work for user pages and always return true for kernel pages. - */ -static inline int pte_protnone(pte_t pte) -{ - return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT; -} -#endif /* CONFIG_NUMA_BALANCING */ - -static inline int pte_present(pte_t pte) -{ - return !!(pte_val(pte) & _PAGE_PRESENT); -} - -/* Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - * - * Even if PTEs can be unsigned long long, a PFN is always an unsigned - * long for now. - */ -static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) -{ - return __pte((((pte_basic_t)(pfn) << PTE_RPN_SHIFT) & PTE_RPN_MASK) | - pgprot_val(pgprot)); -} - -static inline unsigned long pte_pfn(pte_t pte) -{ - return (pte_val(pte) & PTE_RPN_MASK) >> PTE_RPN_SHIFT; -} - -/* Generic modifiers for PTE bits */ -static inline pte_t pte_wrprotect(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_RW); -} - -static inline pte_t pte_mkclean(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_DIRTY); -} - -static inline pte_t pte_mkold(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_ACCESSED); -} - -static inline pte_t pte_mkwrite(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_RW); -} - -static inline pte_t pte_mkdirty(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY); -} - -static inline pte_t pte_mkyoung(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_ACCESSED); -} - -static inline pte_t pte_mkspecial(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_SPECIAL); -} - -static inline pte_t pte_mkhuge(pte_t pte) -{ - return pte; -} - -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) -{ - return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); + return (pte_val(pte) & ~H_PTE_NONE_MASK) == 0; } /* This low level function performs the actual PTE insertion @@ -487,8 +171,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) * an horrible mess that I'm not going to try to clean up now but * I'm keeping it in one place rather than spread around */ -static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, int percpu) +static inline void hash__set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, int percpu) { /* * Anything else just stores the PTE normally. That covers all 64-bit @@ -497,53 +181,6 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, *ptep = pte; } -/* - * Macro to mark a page protection value as "uncacheable". - */ - -#define _PAGE_CACHE_CTL (_PAGE_COHERENT | _PAGE_GUARDED | _PAGE_NO_CACHE | \ - _PAGE_WRITETHRU) - -#define pgprot_noncached pgprot_noncached -static inline pgprot_t pgprot_noncached(pgprot_t prot) -{ - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | - _PAGE_NO_CACHE | _PAGE_GUARDED); -} - -#define pgprot_noncached_wc pgprot_noncached_wc -static inline pgprot_t pgprot_noncached_wc(pgprot_t prot) -{ - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | - _PAGE_NO_CACHE); -} - -#define pgprot_cached pgprot_cached -static inline pgprot_t pgprot_cached(pgprot_t prot) -{ - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | - _PAGE_COHERENT); -} - -#define pgprot_cached_wthru pgprot_cached_wthru -static inline pgprot_t pgprot_cached_wthru(pgprot_t prot) -{ - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | - _PAGE_COHERENT | _PAGE_WRITETHRU); -} - -#define pgprot_cached_noncoherent pgprot_cached_noncoherent -static inline pgprot_t pgprot_cached_noncoherent(pgprot_t prot) -{ - return __pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL); -} - -#define pgprot_writecombine pgprot_writecombine -static inline pgprot_t pgprot_writecombine(pgprot_t prot) -{ - return pgprot_noncached_wc(prot); -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long old_pmd); @@ -556,6 +193,14 @@ static inline void hpte_do_hugepage_flush(struct mm_struct *mm, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +extern int hash__map_kernel_page(unsigned long ea, unsigned long pa, + unsigned long flags); +extern int __meminit hash__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys); +extern void hash__vmemmap_remove_mapping(unsigned long start, + unsigned long page_size); #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h b/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h new file mode 100644 index 000000000000..60f47649306f --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h @@ -0,0 +1,14 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H +#define _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H +/* + * For radix we want generic code to handle hugetlb. But then if we want + * both hash and radix to be enabled together we need to workaround the + * limitations. + */ +void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +extern unsigned long +radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); +#endif diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 0cea4807e26f..290157e8d5b2 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -1,5 +1,5 @@ -#ifndef _ASM_POWERPC_MMU_HASH64_H_ -#define _ASM_POWERPC_MMU_HASH64_H_ +#ifndef _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ +#define _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ /* * PowerPC64 memory management structures * @@ -78,6 +78,10 @@ #define HPTE_V_SECONDARY ASM_CONST(0x0000000000000002) #define HPTE_V_VALID ASM_CONST(0x0000000000000001) +/* + * ISA 3.0 have a different HPTE format. + */ +#define HPTE_R_3_0_SSIZE_SHIFT 58 #define HPTE_R_PP0 ASM_CONST(0x8000000000000000) #define HPTE_R_TS ASM_CONST(0x4000000000000000) #define HPTE_R_KEY_HI ASM_CONST(0x3000000000000000) @@ -115,6 +119,7 @@ #define POWER7_TLB_SETS 128 /* # sets in POWER7 TLB */ #define POWER8_TLB_SETS 512 /* # sets in POWER8 TLB */ #define POWER9_TLB_SETS_HASH 256 /* # sets in POWER9 TLB Hash mode */ +#define POWER9_TLB_SETS_RADIX 128 /* # sets in POWER9 TLB Radix mode */ #ifndef __ASSEMBLY__ @@ -127,24 +132,6 @@ extern struct hash_pte *htab_address; extern unsigned long htab_size_bytes; extern unsigned long htab_hash_mask; -/* - * Page size definition - * - * shift : is the "PAGE_SHIFT" value for that page size - * sllp : is a bit mask with the value of SLB L || LP to be or'ed - * directly to a slbmte "vsid" value - * penc : is the HPTE encoding mask for the "LP" field: - * - */ -struct mmu_psize_def -{ - unsigned int shift; /* number of bits */ - int penc[MMU_PAGE_COUNT]; /* HPTE encoding */ - unsigned int tlbiel; /* tlbiel supported for that page size */ - unsigned long avpnm; /* bits to mask out in AVPN in the HPTE */ - unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */ -}; -extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; static inline int shift_to_mmu_psize(unsigned int shift) { @@ -210,11 +197,6 @@ static inline int segment_shift(int ssize) /* * The current system page and segment sizes */ -extern int mmu_linear_psize; -extern int mmu_virtual_psize; -extern int mmu_vmalloc_psize; -extern int mmu_vmemmap_psize; -extern int mmu_io_psize; extern int mmu_kernel_ssize; extern int mmu_highuser_ssize; extern u16 mmu_slb_size; @@ -247,7 +229,8 @@ static inline unsigned long hpte_encode_avpn(unsigned long vpn, int psize, */ v = (vpn >> (23 - VPN_SHIFT)) & ~(mmu_psize_defs[psize].avpnm); v <<= HPTE_V_AVPN_SHIFT; - v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; return v; } @@ -271,8 +254,12 @@ static inline unsigned long hpte_encode_v(unsigned long vpn, int base_psize, * aligned for the requested page size */ static inline unsigned long hpte_encode_r(unsigned long pa, int base_psize, - int actual_psize) + int actual_psize, int ssize) { + + if (cpu_has_feature(CPU_FTR_ARCH_300)) + pa |= ((unsigned long) ssize) << HPTE_R_3_0_SSIZE_SHIFT; + /* A 4K page needs no special encoding */ if (actual_psize == MMU_PAGE_4K) return pa & HPTE_R_RPN; @@ -476,7 +463,7 @@ extern void slb_set_size(u16 size); add rt,rt,rx /* 4 bits per slice and we have one slice per 1TB */ -#define SLICE_ARRAY_SIZE (PGTABLE_RANGE >> 41) +#define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) #ifndef __ASSEMBLY__ @@ -512,38 +499,6 @@ static inline void subpage_prot_free(struct mm_struct *mm) {} static inline void subpage_prot_init_new_context(struct mm_struct *mm) { } #endif /* CONFIG_PPC_SUBPAGE_PROT */ -typedef unsigned long mm_context_id_t; -struct spinlock; - -typedef struct { - mm_context_id_t id; - u16 user_psize; /* page size index */ - -#ifdef CONFIG_PPC_MM_SLICES - u64 low_slices_psize; /* SLB page size encodings */ - unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; -#else - u16 sllp; /* SLB page size encoding */ -#endif - unsigned long vdso_base; -#ifdef CONFIG_PPC_SUBPAGE_PROT - struct subpage_prot_table spt; -#endif /* CONFIG_PPC_SUBPAGE_PROT */ -#ifdef CONFIG_PPC_ICSWX - struct spinlock *cop_lockp; /* guard acop and cop_pid */ - unsigned long acop; /* mask of enabled coprocessor types */ - unsigned int cop_pid; /* pid value used with coprocessors */ -#endif /* CONFIG_PPC_ICSWX */ -#ifdef CONFIG_PPC_64K_PAGES - /* for 4K PTE fragment support */ - void *pte_frag; -#endif -#ifdef CONFIG_SPAPR_TCE_IOMMU - struct list_head iommu_group_mem_list; -#endif -} mm_context_t; - - #if 0 /* * The code below is equivalent to this function for arguments @@ -579,7 +534,7 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, /* * Bad address. We return VSID 0 for that */ - if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) + if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE) return 0; if (ssize == MMU_SEGSIZE_256M) @@ -613,4 +568,4 @@ unsigned htab_shift_for_mem_size(unsigned long mem_size); #endif /* __ASSEMBLY__ */ -#endif /* _ASM_POWERPC_MMU_HASH64_H_ */ +#endif /* _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h new file mode 100644 index 000000000000..5854263d4d6e --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -0,0 +1,137 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_MMU_H_ +#define _ASM_POWERPC_BOOK3S_64_MMU_H_ + +#ifndef __ASSEMBLY__ +/* + * Page size definition + * + * shift : is the "PAGE_SHIFT" value for that page size + * sllp : is a bit mask with the value of SLB L || LP to be or'ed + * directly to a slbmte "vsid" value + * penc : is the HPTE encoding mask for the "LP" field: + * + */ +struct mmu_psize_def { + unsigned int shift; /* number of bits */ + int penc[MMU_PAGE_COUNT]; /* HPTE encoding */ + unsigned int tlbiel; /* tlbiel supported for that page size */ + unsigned long avpnm; /* bits to mask out in AVPN in the HPTE */ + union { + unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */ + unsigned long ap; /* Ap encoding used by PowerISA 3.0 */ + }; +}; +extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; + +#define radix_enabled() mmu_has_feature(MMU_FTR_RADIX) + +#endif /* __ASSEMBLY__ */ + +/* 64-bit classic hash table MMU */ +#include + +#ifndef __ASSEMBLY__ +/* + * ISA 3.0 partiton and process table entry format + */ +struct prtb_entry { + __be64 prtb0; + __be64 prtb1; +}; +extern struct prtb_entry *process_tb; + +struct patb_entry { + __be64 patb0; + __be64 patb1; +}; +extern struct patb_entry *partition_tb; + +#define PATB_HR (1UL << 63) +#define PATB_GR (1UL << 63) +#define RPDB_MASK 0x0ffffffffffff00fUL +#define RPDB_SHIFT (1UL << 8) +/* + * Limit process table to PAGE_SIZE table. This + * also limit the max pid we can support. + * MAX_USER_CONTEXT * 16 bytes of space. + */ +#define PRTB_SIZE_SHIFT (CONTEXT_BITS + 4) +/* + * Power9 currently only support 64K partition table size. + */ +#define PATB_SIZE_SHIFT 16 + +typedef unsigned long mm_context_id_t; +struct spinlock; + +typedef struct { + mm_context_id_t id; + u16 user_psize; /* page size index */ + +#ifdef CONFIG_PPC_MM_SLICES + u64 low_slices_psize; /* SLB page size encodings */ + unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; +#else + u16 sllp; /* SLB page size encoding */ +#endif + unsigned long vdso_base; +#ifdef CONFIG_PPC_SUBPAGE_PROT + struct subpage_prot_table spt; +#endif /* CONFIG_PPC_SUBPAGE_PROT */ +#ifdef CONFIG_PPC_ICSWX + struct spinlock *cop_lockp; /* guard acop and cop_pid */ + unsigned long acop; /* mask of enabled coprocessor types */ + unsigned int cop_pid; /* pid value used with coprocessors */ +#endif /* CONFIG_PPC_ICSWX */ +#ifdef CONFIG_PPC_64K_PAGES + /* for 4K PTE fragment support */ + void *pte_frag; +#endif +#ifdef CONFIG_SPAPR_TCE_IOMMU + struct list_head iommu_group_mem_list; +#endif +} mm_context_t; + +/* + * The current system page and segment sizes + */ +extern int mmu_linear_psize; +extern int mmu_virtual_psize; +extern int mmu_vmalloc_psize; +extern int mmu_vmemmap_psize; +extern int mmu_io_psize; + +/* MMU initialization */ +extern void radix_init_native(void); +extern void hash__early_init_mmu(void); +extern void radix__early_init_mmu(void); +static inline void early_init_mmu(void) +{ + if (radix_enabled()) + return radix__early_init_mmu(); + return hash__early_init_mmu(); +} +extern void hash__early_init_mmu_secondary(void); +extern void radix__early_init_mmu_secondary(void); +static inline void early_init_mmu_secondary(void) +{ + if (radix_enabled()) + return radix__early_init_mmu_secondary(); + return hash__early_init_mmu_secondary(); +} + +extern void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size); +extern void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size); +static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + if (radix_enabled()) + return radix__setup_initial_memory_limit(first_memblock_base, + first_memblock_size); + return hash__setup_initial_memory_limit(first_memblock_base, + first_memblock_size); +} +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h new file mode 100644 index 000000000000..488279edb1f0 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -0,0 +1,207 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_PGALLOC_H +#define _ASM_POWERPC_BOOK3S_64_PGALLOC_H +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +struct vmemmap_backing { + struct vmemmap_backing *list; + unsigned long phys; + unsigned long virt_addr; +}; +extern struct vmemmap_backing *vmemmap_list; + +/* + * Functions that deal with pagetables that could be at any level of + * the table need to be passed an "index_size" so they know how to + * handle allocation. For PTE pages (which are linked to a struct + * page for now, and drawn from the main get_free_pages() pool), the + * allocation size will be (2^index_size * sizeof(pointer)) and + * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). + * + * The maximum index size needs to be big enough to allow any + * pagetable sizes we need, but small enough to fit in the low bits of + * any page table pointer. In other words all pagetables, even tiny + * ones, must be aligned to allow at least enough low 0 bits to + * contain this value. This value is also used as a mask, so it must + * be one less than a power of two. + */ +#define MAX_PGTABLE_INDEX_SIZE 0xf + +extern struct kmem_cache *pgtable_cache[]; +#define PGT_CACHE(shift) ({ \ + BUG_ON(!(shift)); \ + pgtable_cache[(shift) - 1]; \ + }) + +#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO + +extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int); +extern void pte_fragment_free(unsigned long *, int); +extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); +#ifdef CONFIG_SMP +extern void __tlb_remove_table(void *_table); +#endif + +static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm) +{ +#ifdef CONFIG_PPC_64K_PAGES + return (pgd_t *)__get_free_page(PGALLOC_GFP); +#else + struct page *page; + page = alloc_pages(PGALLOC_GFP, 4); + if (!page) + return NULL; + return (pgd_t *) page_address(page); +#endif +} + +static inline void radix__pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ +#ifdef CONFIG_PPC_64K_PAGES + free_page((unsigned long)pgd); +#else + free_pages((unsigned long)pgd, 4); +#endif +} + +static inline pgd_t *pgd_alloc(struct mm_struct *mm) +{ + if (radix_enabled()) + return radix__pgd_alloc(mm); + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); +} + +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + if (radix_enabled()) + return radix__pgd_free(mm, pgd); + kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); +} + +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + pgd_set(pgd, __pgtable_ptr_val(pud) | PGD_VAL_BITS); +} + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), + GFP_KERNEL|__GFP_REPEAT); +} + +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); +} + +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + pud_set(pud, __pgtable_ptr_val(pmd) | PUD_VAL_BITS); +} + +static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) +{ + pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE); +} + +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), + GFP_KERNEL|__GFP_REPEAT); +} + +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +{ + kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); +} + +static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long address) +{ + return pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX); +} + +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, + pte_t *pte) +{ + pmd_set(pmd, __pgtable_ptr_val(pte) | PMD_VAL_BITS); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, + pgtable_t pte_page) +{ + pmd_set(pmd, __pgtable_ptr_val(pte_page) | PMD_VAL_BITS); +} + +static inline pgtable_t pmd_pgtable(pmd_t pmd) +{ + return (pgtable_t)pmd_page_vaddr(pmd); +} + +#ifdef CONFIG_PPC_4K_PAGES +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long address) +{ + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); +} + +static inline pgtable_t pte_alloc_one(struct mm_struct *mm, + unsigned long address) +{ + struct page *page; + pte_t *pte; + + pte = pte_alloc_one_kernel(mm, address); + if (!pte) + return NULL; + page = virt_to_page(pte); + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } + return pte; +} +#else /* if CONFIG_PPC_64K_PAGES */ + +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long address) +{ + return (pte_t *)pte_fragment_alloc(mm, address, 1); +} + +static inline pgtable_t pte_alloc_one(struct mm_struct *mm, + unsigned long address) +{ + return (pgtable_t)pte_fragment_alloc(mm, address, 0); +} +#endif + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + pte_fragment_free((unsigned long *)pte, 1); +} + +static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) +{ + pte_fragment_free((unsigned long *)ptepage, 0); +} + +static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, + unsigned long address) +{ + tlb_flush_pgtable(tlb, address); + pgtable_free_tlb(tlb, table, 0); +} + +#define check_pgt_cache() do { } while (0) + +#endif /* _ASM_POWERPC_BOOK3S_64_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h new file mode 100644 index 000000000000..71e9abced493 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h @@ -0,0 +1,53 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H +#define _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H +/* + * hash 4k can't share hugetlb and also doesn't support THP + */ +#ifndef __ASSEMBLY__ +#ifdef CONFIG_HUGETLB_PAGE +static inline int pmd_huge(pmd_t pmd) +{ + /* + * leaf pte for huge page + */ + if (radix_enabled()) + return !!(pmd_val(pmd) & _PAGE_PTE); + return 0; +} + +static inline int pud_huge(pud_t pud) +{ + /* + * leaf pte for huge page + */ + if (radix_enabled()) + return !!(pud_val(pud) & _PAGE_PTE); + return 0; +} + +static inline int pgd_huge(pgd_t pgd) +{ + /* + * leaf pte for huge page + */ + if (radix_enabled()) + return !!(pgd_val(pgd) & _PAGE_PTE); + return 0; +} +#define pgd_huge pgd_huge +/* + * With radix , we have hugepage ptes in the pud and pmd entries. We don't + * need to setup hugepage directory for them. Our pte and page directory format + * enable us to have this enabled. + */ +static inline int hugepd_ok(hugepd_t hpd) +{ + if (radix_enabled()) + return 0; + return hash__hugepd_ok(hpd); +} +#define is_hugepd(hpd) (hugepd_ok(hpd)) +#endif /* CONFIG_HUGETLB_PAGE */ +#endif /* __ASSEMBLY__ */ + +#endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h new file mode 100644 index 000000000000..cb2d0a5fa3f8 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -0,0 +1,64 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H +#define _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H + +#ifndef __ASSEMBLY__ +#ifdef CONFIG_HUGETLB_PAGE +/* + * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have + * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD; + * + * Defined in such a way that we can optimize away code block at build time + * if CONFIG_HUGETLB_PAGE=n. + */ +static inline int pmd_huge(pmd_t pmd) +{ + /* + * leaf pte for huge page + */ + return !!(pmd_val(pmd) & _PAGE_PTE); +} + +static inline int pud_huge(pud_t pud) +{ + /* + * leaf pte for huge page + */ + return !!(pud_val(pud) & _PAGE_PTE); +} + +static inline int pgd_huge(pgd_t pgd) +{ + /* + * leaf pte for huge page + */ + return !!(pgd_val(pgd) & _PAGE_PTE); +} +#define pgd_huge pgd_huge + +#ifdef CONFIG_DEBUG_VM +extern int hugepd_ok(hugepd_t hpd); +#define is_hugepd(hpd) (hugepd_ok(hpd)) +#else +/* + * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't + * need to setup hugepage directory for them. Our pte and page directory format + * enable us to have this enabled. + */ +static inline int hugepd_ok(hugepd_t hpd) +{ + return 0; +} +#define is_hugepd(pdep) 0 +#endif /* CONFIG_DEBUG_VM */ + +#endif /* CONFIG_HUGETLB_PAGE */ + +static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t prot) +{ + if (radix_enabled()) + BUG(); + return hash__remap_4k_pfn(vma, addr, pfn, prot); +} +#endif /* __ASSEMBLY__ */ +#endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 8fe6f6b48aa5..88a5ecaa157b 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1,13 +1,247 @@ #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ + /* - * This file contains the functions and defines necessary to modify and use - * the ppc64 hashed page table. + * Common bits between hash and Radix page table */ +#define _PAGE_BIT_SWAP_TYPE 0 + +#define _PAGE_EXEC 0x00001 /* execute permission */ +#define _PAGE_WRITE 0x00002 /* write access allowed */ +#define _PAGE_READ 0x00004 /* read access allowed */ +#define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) +#define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC) +#define _PAGE_PRIVILEGED 0x00008 /* kernel access only */ +#define _PAGE_SAO 0x00010 /* Strong access order */ +#define _PAGE_NON_IDEMPOTENT 0x00020 /* non idempotent memory */ +#define _PAGE_TOLERANT 0x00030 /* tolerant memory, cache inhibited */ +#define _PAGE_DIRTY 0x00080 /* C: page changed */ +#define _PAGE_ACCESSED 0x00100 /* R: page referenced */ +/* + * Software bits + */ +#define _RPAGE_SW0 0x2000000000000000UL +#define _RPAGE_SW1 0x00800 +#define _RPAGE_SW2 0x00400 +#define _RPAGE_SW3 0x00200 +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */ +#else +#define _PAGE_SOFT_DIRTY 0x00000 +#endif +#define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */ + + +#define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */ +#define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */ +/* + * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE + * Instead of fixing all of them, add an alternate define which + * maps CI pte mapping. + */ +#define _PAGE_NO_CACHE _PAGE_TOLERANT +/* + * We support 57 bit real address in pte. Clear everything above 57, and + * every thing below PAGE_SHIFT; + */ +#define PTE_RPN_MASK (((1UL << 57) - 1) & (PAGE_MASK)) +/* + * set of bits not changed in pmd_modify. Even though we have hash specific bits + * in here, on radix we expect them to be zero. + */ +#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ + _PAGE_ACCESSED | H_PAGE_THP_HUGE | _PAGE_PTE | \ + _PAGE_SOFT_DIRTY) +/* + * user access blocked by key + */ +#define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY) +#define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_READ) +#define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | \ + _PAGE_RW | _PAGE_EXEC) +/* + * No page size encoding in the linux PTE + */ +#define _PAGE_PSIZE 0 +/* + * _PAGE_CHG_MASK masks of bits that are to be preserved across + * pgprot changes + */ +#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ + _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \ + _PAGE_SOFT_DIRTY) +/* + * Mask of bits returned by pte_pgprot() + */ +#define PAGE_PROT_BITS (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \ + H_PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \ + _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_EXEC | \ + _PAGE_SOFT_DIRTY) +/* + * We define 2 sets of base prot bits, one for basic pages (ie, + * cacheable kernel and user pages) and one for non cacheable + * pages. We always set _PAGE_COHERENT when SMP is enabled or + * the processor might need it for DMA coherency. + */ +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) +#define _PAGE_BASE (_PAGE_BASE_NC) + +/* Permission masks used to generate the __P and __S table, + * + * Note:__pgprot is defined in arch/powerpc/include/asm/page.h + * + * Write permissions imply read permissions for now (we could make write-only + * pages on BookE but we don't bother for now). Execute permission control is + * possible on platforms that define _PAGE_EXEC + * + * Note due to the way vm flags are laid out, the bits are XWR + */ +#define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_PRIVILEGED) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_RW | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_READ) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_READ) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) + +#define __P000 PAGE_NONE +#define __P001 PAGE_READONLY +#define __P010 PAGE_COPY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY_X +#define __P101 PAGE_READONLY_X +#define __P110 PAGE_COPY_X +#define __P111 PAGE_COPY_X + +#define __S000 PAGE_NONE +#define __S001 PAGE_READONLY +#define __S010 PAGE_SHARED +#define __S011 PAGE_SHARED +#define __S100 PAGE_READONLY_X +#define __S101 PAGE_READONLY_X +#define __S110 PAGE_SHARED_X +#define __S111 PAGE_SHARED_X + +/* Permission masks used for kernel mappings */ +#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) +#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ + _PAGE_TOLERANT) +#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ + _PAGE_NON_IDEMPOTENT) +#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) +#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) +#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) + +/* + * Protection used for kernel text. We want the debuggers to be able to + * set breakpoints anywhere, so don't write protect the kernel text + * on platforms where such control is possible. + */ +#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) || \ + defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) +#define PAGE_KERNEL_TEXT PAGE_KERNEL_X +#else +#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX +#endif + +/* Make modules code happy. We don't set RO yet */ +#define PAGE_KERNEL_EXEC PAGE_KERNEL_X +#define PAGE_AGP (PAGE_KERNEL_NC) + +#ifndef __ASSEMBLY__ +/* + * page table defines + */ +extern unsigned long __pte_index_size; +extern unsigned long __pmd_index_size; +extern unsigned long __pud_index_size; +extern unsigned long __pgd_index_size; +extern unsigned long __pmd_cache_index; +#define PTE_INDEX_SIZE __pte_index_size +#define PMD_INDEX_SIZE __pmd_index_size +#define PUD_INDEX_SIZE __pud_index_size +#define PGD_INDEX_SIZE __pgd_index_size +#define PMD_CACHE_INDEX __pmd_cache_index +/* + * Because of use of pte fragments and THP, size of page table + * are not always derived out of index size above. + */ +extern unsigned long __pte_table_size; +extern unsigned long __pmd_table_size; +extern unsigned long __pud_table_size; +extern unsigned long __pgd_table_size; +#define PTE_TABLE_SIZE __pte_table_size +#define PMD_TABLE_SIZE __pmd_table_size +#define PUD_TABLE_SIZE __pud_table_size +#define PGD_TABLE_SIZE __pgd_table_size + +extern unsigned long __pmd_val_bits; +extern unsigned long __pud_val_bits; +extern unsigned long __pgd_val_bits; +#define PMD_VAL_BITS __pmd_val_bits +#define PUD_VAL_BITS __pud_val_bits +#define PGD_VAL_BITS __pgd_val_bits + +extern unsigned long __pte_frag_nr; +#define PTE_FRAG_NR __pte_frag_nr +extern unsigned long __pte_frag_size_shift; +#define PTE_FRAG_SIZE_SHIFT __pte_frag_size_shift +#define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT) +/* + * Pgtable size used by swapper, init in asm code + */ +#define MAX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE) + +#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) +#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) +#define PTRS_PER_PUD (1 << PUD_INDEX_SIZE) +#define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) + +/* PMD_SHIFT determines what a second-level page table entry can map */ +#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE) +#define PMD_SIZE (1UL << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE-1)) + +/* PUD_SHIFT determines what a third-level page table entry can map */ +#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) +#define PUD_SIZE (1UL << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE-1)) + +/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ +#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE) +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE-1)) + +/* Bits to mask out from a PMD to get to the PTE page */ +#define PMD_MASKED_BITS 0xc0000000000000ffUL +/* Bits to mask out from a PUD to get to the PMD page */ +#define PUD_MASKED_BITS 0xc0000000000000ffUL +/* Bits to mask out from a PGD to get to the PUD page */ +#define PGD_MASKED_BITS 0xc0000000000000ffUL + +extern unsigned long __vmalloc_start; +extern unsigned long __vmalloc_end; +#define VMALLOC_START __vmalloc_start +#define VMALLOC_END __vmalloc_end + +extern unsigned long __kernel_virt_start; +extern unsigned long __kernel_virt_size; +#define KERN_VIRT_START __kernel_virt_start +#define KERN_VIRT_SIZE __kernel_virt_size +extern struct page *vmemmap; +extern unsigned long ioremap_bot; +#endif /* __ASSEMBLY__ */ #include -#include +#include +#ifdef CONFIG_PPC_64K_PAGES +#include +#else +#include +#endif + +#include /* * The second half of the kernel virtual space is used for IO mappings, * it's itself carved into the PIO region (ISA and PHB IO space) and @@ -26,8 +260,6 @@ #define IOREMAP_BASE (PHB_IO_END) #define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE) -#define vmemmap ((struct page *)VMEMMAP_BASE) - /* Advertise special mapping type for AGP */ #define HAVE_PAGE_AGP @@ -45,7 +277,7 @@ #define __real_pte(e,p) ((real_pte_t){(e)}) #define __rpte_to_pte(r) ((r).pte) -#define __rpte_to_hidx(r,index) (pte_val(__rpte_to_pte(r)) >>_PAGE_F_GIX_SHIFT) +#define __rpte_to_hidx(r,index) (pte_val(__rpte_to_pte(r)) >> H_PAGE_F_GIX_SHIFT) #define pte_iterate_hashed_subpages(rpte, psize, va, index, shift) \ do { \ @@ -62,6 +294,327 @@ #endif /* __real_pte */ +static inline unsigned long pte_update(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long clr, + unsigned long set, int huge) +{ + if (radix_enabled()) + return radix__pte_update(mm, addr, ptep, clr, set, huge); + return hash__pte_update(mm, addr, ptep, clr, set, huge); +} +/* + * For hash even if we have _PAGE_ACCESSED = 0, we do a pte_update. + * We currently remove entries from the hashtable regardless of whether + * the entry was young or dirty. + * + * We should be more intelligent about this but for the moment we override + * these functions and force a tlb flush unconditionally + * For radix: H_PAGE_HASHPTE should be zero. Hence we can use the same + * function for both hash and radix. + */ +static inline int __ptep_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + unsigned long old; + + if ((pte_val(*ptep) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) + return 0; + old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); + return (old & _PAGE_ACCESSED) != 0; +} + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +#define ptep_test_and_clear_young(__vma, __addr, __ptep) \ +({ \ + int __r; \ + __r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \ + __r; \ +}) + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + + if ((pte_val(*ptep) & _PAGE_WRITE) == 0) + return; + + pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0); +} + +static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + if ((pte_val(*ptep) & _PAGE_WRITE) == 0) + return; + + pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1); +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0); + return __pte(old); +} + +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t * ptep) +{ + pte_update(mm, addr, ptep, ~0UL, 0, 0); +} +static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_WRITE);} +static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); } +static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); } +static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); } +static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } + +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY +static inline bool pte_soft_dirty(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_SOFT_DIRTY); +} +static inline pte_t pte_mksoft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_clear_soft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY); +} +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ + +#ifdef CONFIG_NUMA_BALANCING +/* + * These work without NUMA balancing but the kernel does not care. See the + * comment in include/asm-generic/pgtable.h . On powerpc, this will only + * work for user pages and always return true for kernel pages. + */ +static inline int pte_protnone(pte_t pte) +{ + return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PRIVILEGED)) == + (_PAGE_PRESENT | _PAGE_PRIVILEGED); +} +#endif /* CONFIG_NUMA_BALANCING */ + +static inline int pte_present(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_PRESENT); +} +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + * + * Even if PTEs can be unsigned long long, a PFN is always an unsigned + * long for now. + */ +static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) +{ + return __pte((((pte_basic_t)(pfn) << PAGE_SHIFT) & PTE_RPN_MASK) | + pgprot_val(pgprot)); +} + +static inline unsigned long pte_pfn(pte_t pte) +{ + return (pte_val(pte) & PTE_RPN_MASK) >> PAGE_SHIFT; +} + +/* Generic modifiers for PTE bits */ +static inline pte_t pte_wrprotect(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_WRITE); +} + +static inline pte_t pte_mkclean(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_DIRTY); +} + +static inline pte_t pte_mkold(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_ACCESSED); +} + +static inline pte_t pte_mkwrite(pte_t pte) +{ + /* + * write implies read, hence set both + */ + return __pte(pte_val(pte) | _PAGE_RW); +} + +static inline pte_t pte_mkdirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_ACCESSED); +} + +static inline pte_t pte_mkspecial(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SPECIAL); +} + +static inline pte_t pte_mkhuge(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) +{ + /* FIXME!! check whether this need to be a conditional */ + return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); +} + +static inline bool pte_user(pte_t pte) +{ + return !(pte_val(pte) & _PAGE_PRIVILEGED); +} + +/* Encode and de-code a swap entry */ +#define MAX_SWAPFILES_CHECK() do { \ + BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \ + /* \ + * Don't have overlapping bits with _PAGE_HPTEFLAGS \ + * We filter HPTEFLAGS on set_pte. \ + */ \ + BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \ + BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \ + } while (0) +/* + * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT; + */ +#define SWP_TYPE_BITS 5 +#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ + & ((1UL << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PAGE_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << _PAGE_BIT_SWAP_TYPE) \ + | (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)}) +/* + * swp_entry_t must be independent of pte bits. We build a swp_entry_t from + * swap type and offset we get from swap and convert that to pte to find a + * matching pte in linux page table. + * Clear bits not found in swap entries here. + */ +#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE }) +#define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE) + +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) +#else +#define _PAGE_SWP_SOFT_DIRTY 0UL +#endif /* CONFIG_MEM_SOFT_DIRTY */ + +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY); +} +static inline bool pte_swp_soft_dirty(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY); +} +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY); +} +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ + +static inline bool check_pte_access(unsigned long access, unsigned long ptev) +{ + /* + * This check for _PAGE_RWX and _PAGE_PRESENT bits + */ + if (access & ~ptev) + return false; + /* + * This check for access to privilege space + */ + if ((access & _PAGE_PRIVILEGED) != (ptev & _PAGE_PRIVILEGED)) + return false; + + return true; +} +/* + * Generic functions with hash/radix callbacks + */ + +static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) +{ + if (radix_enabled()) + return radix__ptep_set_access_flags(ptep, entry); + return hash__ptep_set_access_flags(ptep, entry); +} + +#define __HAVE_ARCH_PTE_SAME +static inline int pte_same(pte_t pte_a, pte_t pte_b) +{ + if (radix_enabled()) + return radix__pte_same(pte_a, pte_b); + return hash__pte_same(pte_a, pte_b); +} + +static inline int pte_none(pte_t pte) +{ + if (radix_enabled()) + return radix__pte_none(pte); + return hash__pte_none(pte); +} + +static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, int percpu) +{ + if (radix_enabled()) + return radix__set_pte_at(mm, addr, ptep, pte, percpu); + return hash__set_pte_at(mm, addr, ptep, pte, percpu); +} + +#define _PAGE_CACHE_CTL (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT) + +#define pgprot_noncached pgprot_noncached +static inline pgprot_t pgprot_noncached(pgprot_t prot) +{ + return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | + _PAGE_NON_IDEMPOTENT); +} + +#define pgprot_noncached_wc pgprot_noncached_wc +static inline pgprot_t pgprot_noncached_wc(pgprot_t prot) +{ + return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | + _PAGE_TOLERANT); +} + +#define pgprot_cached pgprot_cached +static inline pgprot_t pgprot_cached(pgprot_t prot) +{ + return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL)); +} + +#define pgprot_writecombine pgprot_writecombine +static inline pgprot_t pgprot_writecombine(pgprot_t prot) +{ + return pgprot_noncached_wc(prot); +} +/* + * check a pte mapping have cache inhibited property + */ +static inline bool pte_ci(pte_t pte) +{ + unsigned long pte_v = pte_val(pte); + + if (((pte_v & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) || + ((pte_v & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)) + return true; + return false; +} + static inline void pmd_set(pmd_t *pmdp, unsigned long val) { *pmdp = __pmd(val); @@ -75,6 +628,13 @@ static inline void pmd_clear(pmd_t *pmdp) #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_present(pmd) (!pmd_none(pmd)) +static inline int pmd_bad(pmd_t pmd) +{ + if (radix_enabled()) + return radix__pmd_bad(pmd); + return hash__pmd_bad(pmd); +} + static inline void pud_set(pud_t *pudp, unsigned long val) { *pudp = __pud(val); @@ -100,6 +660,15 @@ static inline pud_t pte_pud(pte_t pte) return __pud(pte_val(pte)); } #define pud_write(pud) pte_write(pud_pte(pud)) + +static inline int pud_bad(pud_t pud) +{ + if (radix_enabled()) + return radix__pud_bad(pud); + return hash__pud_bad(pud); +} + + #define pgd_write(pgd) pte_write(pgd_pte(pgd)) static inline void pgd_set(pgd_t *pgdp, unsigned long val) { @@ -124,8 +693,27 @@ static inline pgd_t pte_pgd(pte_t pte) return __pgd(pte_val(pte)); } +static inline int pgd_bad(pgd_t pgd) +{ + if (radix_enabled()) + return radix__pgd_bad(pgd); + return hash__pgd_bad(pgd); +} + extern struct page *pgd_page(pgd_t pgd); +/* Pointers in the page table tree are physical addresses */ +#define __pgtable_ptr_val(ptr) __pa(ptr) + +#define pmd_page_vaddr(pmd) __va(pmd_val(pmd) & ~PMD_MASKED_BITS) +#define pud_page_vaddr(pud) __va(pud_val(pud) & ~PUD_MASKED_BITS) +#define pgd_page_vaddr(pgd) __va(pgd_val(pgd) & ~PGD_MASKED_BITS) + +#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1)) +#define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1)) +#define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1)) +#define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1)) + /* * Find an entry in a page-table-directory. We combine the address region * (the high order N bits) and the pgd portion of the address. @@ -156,74 +744,42 @@ extern struct page *pgd_page(pgd_t pgd); #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) -/* Encode and de-code a swap entry */ -#define MAX_SWAPFILES_CHECK() do { \ - BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \ - /* \ - * Don't have overlapping bits with _PAGE_HPTEFLAGS \ - * We filter HPTEFLAGS on set_pte. \ - */ \ - BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \ - BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \ - } while (0) -/* - * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT; - */ -#define SWP_TYPE_BITS 5 -#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ - & ((1UL << SWP_TYPE_BITS) - 1)) -#define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PTE_RPN_SHIFT) -#define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << _PAGE_BIT_SWAP_TYPE) \ - | (((offset) << PTE_RPN_SHIFT) & PTE_RPN_MASK)}) -/* - * swp_entry_t must be independent of pte bits. We build a swp_entry_t from - * swap type and offset we get from swap and convert that to pte to find a - * matching pte in linux page table. - * Clear bits not found in swap entries here. - */ -#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE }) -#define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE) - -#ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) -#else -#define _PAGE_SWP_SOFT_DIRTY 0UL -#endif /* CONFIG_MEM_SOFT_DIRTY */ - -#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY -static inline pte_t pte_swp_mksoft_dirty(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY); -} -static inline bool pte_swp_soft_dirty(pte_t pte) -{ - return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY); -} -static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY); -} -#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ - void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); void pgtable_cache_init(void); +static inline int map_kernel_page(unsigned long ea, unsigned long pa, + unsigned long flags) +{ + if (radix_enabled()) { +#if defined(CONFIG_PPC_RADIX_MMU) && defined(DEBUG_VM) + unsigned long page_size = 1 << mmu_psize_defs[mmu_io_psize].shift; + WARN((page_size != PAGE_SIZE), "I/O page size != PAGE_SIZE"); +#endif + return radix__map_kernel_page(ea, pa, __pgprot(flags), PAGE_SIZE); + } + return hash__map_kernel_page(ea, pa, flags); +} + +static inline int __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + if (radix_enabled()) + return radix__vmemmap_create_mapping(start, page_size, phys); + return hash__vmemmap_create_mapping(start, page_size, phys); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static inline void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ + if (radix_enabled()) + return radix__vmemmap_remove_mapping(start, page_size); + return hash__vmemmap_remove_mapping(start, page_size); +} +#endif struct page *realmode_pfn_to_page(unsigned long pfn); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); -extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); -extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); -extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd); -extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd); -#define has_transparent_hugepage has_transparent_hugepage -extern int has_transparent_hugepage(void); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - - static inline pte_t pmd_pte(pmd_t pmd) { return __pte(pmd_val(pmd)); @@ -238,7 +794,6 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd) { return (pte_t *)pmd; } - #define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) @@ -265,9 +820,87 @@ static inline int pmd_protnone(pmd_t pmd) #define __HAVE_ARCH_PMD_WRITE #define pmd_write(pmd) pte_write(pmd_pte(pmd)) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); +extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); +extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); +extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd); +extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd); +extern int hash__has_transparent_hugepage(void); +static inline int has_transparent_hugepage(void) +{ + if (radix_enabled()) + return radix__has_transparent_hugepage(); + return hash__has_transparent_hugepage(); +} +#define has_transparent_hugepage has_transparent_hugepage + +static inline unsigned long +pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, + unsigned long clr, unsigned long set) +{ + if (radix_enabled()) + return radix__pmd_hugepage_update(mm, addr, pmdp, clr, set); + return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set); +} + +static inline int pmd_large(pmd_t pmd) +{ + return !!(pmd_val(pmd) & _PAGE_PTE); +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT); +} +/* + * For radix we should always find H_PAGE_HASHPTE zero. Hence + * the below will work for radix too + */ +static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + unsigned long old; + + if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) + return 0; + old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0); + return ((old & _PAGE_ACCESSED) != 0); +} + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + + if ((pmd_val(*pmdp) & _PAGE_WRITE) == 0) + return; + + pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); +} + +static inline int pmd_trans_huge(pmd_t pmd) +{ + if (radix_enabled()) + return radix__pmd_trans_huge(pmd); + return hash__pmd_trans_huge(pmd); +} + +#define __HAVE_ARCH_PMD_SAME +static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + if (radix_enabled()) + return radix__pmd_same(pmd_a, pmd_b); + return hash__pmd_same(pmd_a, pmd_b); +} + static inline pmd_t pmd_mkhuge(pmd_t pmd) { - return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_THP_HUGE)); + if (radix_enabled()) + return radix__pmd_mkhuge(pmd); + return hash__pmd_mkhuge(pmd); } #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS @@ -278,37 +911,63 @@ extern int pmdp_set_access_flags(struct vm_area_struct *vma, #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -extern int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR -extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp); +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (radix_enabled()) + return radix__pmdp_huge_get_and_clear(mm, addr, pmdp); + return hash__pmdp_huge_get_and_clear(mm, addr, pmdp); +} -extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); +static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + if (radix_enabled()) + return radix__pmdp_collapse_flush(vma, address, pmdp); + return hash__pmdp_collapse_flush(vma, address, pmdp); +} #define pmdp_collapse_flush pmdp_collapse_flush #define __HAVE_ARCH_PGTABLE_DEPOSIT -extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable); +static inline void pgtable_trans_huge_deposit(struct mm_struct *mm, + pmd_t *pmdp, pgtable_t pgtable) +{ + if (radix_enabled()) + return radix__pgtable_trans_huge_deposit(mm, pmdp, pgtable); + return hash__pgtable_trans_huge_deposit(mm, pmdp, pgtable); +} + #define __HAVE_ARCH_PGTABLE_WITHDRAW -extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); +static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, + pmd_t *pmdp) +{ + if (radix_enabled()) + return radix__pgtable_trans_huge_withdraw(mm, pmdp); + return hash__pgtable_trans_huge_withdraw(mm, pmdp); +} #define __HAVE_ARCH_PMDP_INVALIDATE extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE -extern void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); +static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + if (radix_enabled()) + return radix__pmdp_huge_split_prepare(vma, address, pmdp); + return hash__pmdp_huge_split_prepare(vma, address, pmdp); +} #define pmd_move_must_withdraw pmd_move_must_withdraw struct spinlock; static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, struct spinlock *old_pmd_ptl) { + if (radix_enabled()) + return false; /* * Archs like ppc64 use pgtable to store per pmd * specific information. So when we switch the pmd, @@ -316,5 +975,6 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, */ return true; } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/radix-4k.h b/arch/powerpc/include/asm/book3s/64/radix-4k.h new file mode 100644 index 000000000000..7c3b1fe1619e --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/radix-4k.h @@ -0,0 +1,12 @@ +#ifndef _ASM_POWERPC_PGTABLE_RADIX_4K_H +#define _ASM_POWERPC_PGTABLE_RADIX_4K_H + +/* + * For 4K page size supported index is 13/9/9/9 + */ +#define RADIX_PTE_INDEX_SIZE 9 /* 2MB huge page */ +#define RADIX_PMD_INDEX_SIZE 9 /* 1G huge page */ +#define RADIX_PUD_INDEX_SIZE 9 +#define RADIX_PGD_INDEX_SIZE 13 + +#endif /* _ASM_POWERPC_PGTABLE_RADIX_4K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/radix-64k.h b/arch/powerpc/include/asm/book3s/64/radix-64k.h new file mode 100644 index 000000000000..82dc355f0b45 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/radix-64k.h @@ -0,0 +1,12 @@ +#ifndef _ASM_POWERPC_PGTABLE_RADIX_64K_H +#define _ASM_POWERPC_PGTABLE_RADIX_64K_H + +/* + * For 64K page size supported index is 13/9/9/5 + */ +#define RADIX_PTE_INDEX_SIZE 5 /* 2MB huge page */ +#define RADIX_PMD_INDEX_SIZE 9 /* 1G huge page */ +#define RADIX_PUD_INDEX_SIZE 9 +#define RADIX_PGD_INDEX_SIZE 13 + +#endif /* _ASM_POWERPC_PGTABLE_RADIX_64K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h new file mode 100644 index 000000000000..937d4e247ac3 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -0,0 +1,232 @@ +#ifndef _ASM_POWERPC_PGTABLE_RADIX_H +#define _ASM_POWERPC_PGTABLE_RADIX_H + +#ifndef __ASSEMBLY__ +#include +#endif + +#ifdef CONFIG_PPC_64K_PAGES +#include +#else +#include +#endif + +/* An empty PTE can still have a R or C writeback */ +#define RADIX_PTE_NONE_MASK (_PAGE_DIRTY | _PAGE_ACCESSED) + +/* Bits to set in a RPMD/RPUD/RPGD */ +#define RADIX_PMD_VAL_BITS (0x8000000000000000UL | RADIX_PTE_INDEX_SIZE) +#define RADIX_PUD_VAL_BITS (0x8000000000000000UL | RADIX_PMD_INDEX_SIZE) +#define RADIX_PGD_VAL_BITS (0x8000000000000000UL | RADIX_PUD_INDEX_SIZE) + +/* Don't have anything in the reserved bits and leaf bits */ +#define RADIX_PMD_BAD_BITS 0x60000000000000e0UL +#define RADIX_PUD_BAD_BITS 0x60000000000000e0UL +#define RADIX_PGD_BAD_BITS 0x60000000000000e0UL + +/* + * Size of EA range mapped by our pagetables. + */ +#define RADIX_PGTABLE_EADDR_SIZE (RADIX_PTE_INDEX_SIZE + RADIX_PMD_INDEX_SIZE + \ + RADIX_PUD_INDEX_SIZE + RADIX_PGD_INDEX_SIZE + PAGE_SHIFT) +#define RADIX_PGTABLE_RANGE (ASM_CONST(1) << RADIX_PGTABLE_EADDR_SIZE) + +/* + * We support 52 bit address space, Use top bit for kernel + * virtual mapping. Also make sure kernel fit in the top + * quadrant. + * + * +------------------+ + * +------------------+ Kernel virtual map (0xc008000000000000) + * | | + * | | + * | | + * 0b11......+------------------+ Kernel linear map (0xc....) + * | | + * | 2 quadrant | + * | | + * 0b10......+------------------+ + * | | + * | 1 quadrant | + * | | + * 0b01......+------------------+ + * | | + * | 0 quadrant | + * | | + * 0b00......+------------------+ + * + * + * 3rd quadrant expanded: + * +------------------------------+ + * | | + * | | + * | | + * +------------------------------+ Kernel IO map end (0xc010000000000000) + * | | + * | | + * | 1/2 of virtual map | + * | | + * | | + * +------------------------------+ Kernel IO map start + * | | + * | 1/4 of virtual map | + * | | + * +------------------------------+ Kernel vmemap start + * | | + * | 1/4 of virtual map | + * | | + * +------------------------------+ Kernel virt start (0xc008000000000000) + * | | + * | | + * | | + * +------------------------------+ Kernel linear (0xc.....) + */ + +#define RADIX_KERN_VIRT_START ASM_CONST(0xc008000000000000) +#define RADIX_KERN_VIRT_SIZE ASM_CONST(0x0008000000000000) + +/* + * The vmalloc space starts at the beginning of that region, and + * occupies a quarter of it on radix config. + * (we keep a quarter for the virtual memmap) + */ +#define RADIX_VMALLOC_START RADIX_KERN_VIRT_START +#define RADIX_VMALLOC_SIZE (RADIX_KERN_VIRT_SIZE >> 2) +#define RADIX_VMALLOC_END (RADIX_VMALLOC_START + RADIX_VMALLOC_SIZE) +/* + * Defines the address of the vmemap area, in its own region on + * hash table CPUs. + */ +#define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END) + +#ifndef __ASSEMBLY__ +#define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) +#define RADIX_PMD_TABLE_SIZE (sizeof(pmd_t) << RADIX_PMD_INDEX_SIZE) +#define RADIX_PUD_TABLE_SIZE (sizeof(pud_t) << RADIX_PUD_INDEX_SIZE) +#define RADIX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE) + +static inline unsigned long radix__pte_update(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, unsigned long clr, + unsigned long set, + int huge) +{ + pte_t pte; + unsigned long old_pte, new_pte; + + do { + pte = READ_ONCE(*ptep); + old_pte = pte_val(pte); + new_pte = (old_pte | set) & ~clr; + + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* We already do a sync in cmpxchg, is ptesync needed ?*/ + asm volatile("ptesync" : : : "memory"); + /* huge pages use the old page table lock */ + if (!huge) + assert_pte_locked(mm, addr); + + return old_pte; +} + +/* + * Set the dirty and/or accessed bits atomically in a linux PTE, this + * function doesn't need to invalidate tlb. + */ +static inline void radix__ptep_set_access_flags(pte_t *ptep, pte_t entry) +{ + pte_t pte; + unsigned long old_pte, new_pte; + unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | + _PAGE_RW | _PAGE_EXEC); + do { + pte = READ_ONCE(*ptep); + old_pte = pte_val(pte); + new_pte = old_pte | set; + + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* We already do a sync in cmpxchg, is ptesync needed ?*/ + asm volatile("ptesync" : : : "memory"); +} + +static inline int radix__pte_same(pte_t pte_a, pte_t pte_b) +{ + return ((pte_raw(pte_a) ^ pte_raw(pte_b)) == 0); +} + +static inline int radix__pte_none(pte_t pte) +{ + return (pte_val(pte) & ~RADIX_PTE_NONE_MASK) == 0; +} + +static inline void radix__set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, int percpu) +{ + *ptep = pte; + asm volatile("ptesync" : : : "memory"); +} + +static inline int radix__pmd_bad(pmd_t pmd) +{ + return !!(pmd_val(pmd) & RADIX_PMD_BAD_BITS); +} + +static inline int radix__pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + return ((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) == 0); +} + +static inline int radix__pud_bad(pud_t pud) +{ + return !!(pud_val(pud) & RADIX_PUD_BAD_BITS); +} + + +static inline int radix__pgd_bad(pgd_t pgd) +{ + return !!(pgd_val(pgd) & RADIX_PGD_BAD_BITS); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +static inline int radix__pmd_trans_huge(pmd_t pmd) +{ + return !!(pmd_val(pmd) & _PAGE_PTE); +} + +static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) | _PAGE_PTE); +} +static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + /* Nothing to do for radix. */ + return; +} + +extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set); +extern pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); +extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); +extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp); +extern int radix__has_transparent_hugepage(void); +#endif + +extern int __meminit radix__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys); +extern void radix__vmemmap_remove_mapping(unsigned long start, + unsigned long page_size); + +extern int radix__map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, unsigned int psz); +#endif /* __ASSEMBLY__ */ +#endif diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 1b753f96b374..f12ddf5e8de5 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -1,8 +1,6 @@ #ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H #define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H -#define MMU_NO_CONTEXT 0 - /* * TLB flushing for 64-bit hash-MMU CPUs */ @@ -29,14 +27,21 @@ extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch); static inline void arch_enter_lazy_mmu_mode(void) { - struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); + struct ppc64_tlb_batch *batch; + if (radix_enabled()) + return; + batch = this_cpu_ptr(&ppc64_tlb_batch); batch->active = 1; } static inline void arch_leave_lazy_mmu_mode(void) { - struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); + struct ppc64_tlb_batch *batch; + + if (radix_enabled()) + return; + batch = this_cpu_ptr(&ppc64_tlb_batch); if (batch->index) __flush_tlb_pending(batch); @@ -52,40 +57,42 @@ extern void flush_hash_range(unsigned long number, int local); extern void flush_hash_hugepage(unsigned long vsid, unsigned long addr, pmd_t *pmdp, unsigned int psize, int ssize, unsigned long flags); - -static inline void local_flush_tlb_mm(struct mm_struct *mm) +static inline void hash__local_flush_tlb_mm(struct mm_struct *mm) { } -static inline void flush_tlb_mm(struct mm_struct *mm) +static inline void hash__flush_tlb_mm(struct mm_struct *mm) { } -static inline void local_flush_tlb_page(struct vm_area_struct *vma, - unsigned long vmaddr) +static inline void hash__local_flush_tlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) { } -static inline void flush_tlb_page(struct vm_area_struct *vma, - unsigned long vmaddr) +static inline void hash__flush_tlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) { } -static inline void flush_tlb_page_nohash(struct vm_area_struct *vma, - unsigned long vmaddr) +static inline void hash__flush_tlb_page_nohash(struct vm_area_struct *vma, + unsigned long vmaddr) { } -static inline void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +static inline void hash__flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) { } -static inline void flush_tlb_kernel_range(unsigned long start, - unsigned long end) +static inline void hash__flush_tlb_kernel_range(unsigned long start, + unsigned long end) { } + +struct mmu_gather; +extern void hash__tlb_flush(struct mmu_gather *tlb); /* Private function for use by PCI IO mapping code */ extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, unsigned long end); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h new file mode 100644 index 000000000000..13ef38828dfe --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -0,0 +1,33 @@ +#ifndef _ASM_POWERPC_TLBFLUSH_RADIX_H +#define _ASM_POWERPC_TLBFLUSH_RADIX_H + +struct vm_area_struct; +struct mm_struct; +struct mmu_gather; + +static inline int mmu_get_ap(int psize) +{ + return mmu_psize_defs[psize].ap; +} + +extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end); +extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end); + +extern void radix__local_flush_tlb_mm(struct mm_struct *mm); +extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +extern void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + unsigned long ap, int nid); +extern void radix__tlb_flush(struct mmu_gather *tlb); +#ifdef CONFIG_SMP +extern void radix__flush_tlb_mm(struct mm_struct *mm); +extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +extern void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + unsigned long ap, int nid); +#else +#define radix__flush_tlb_mm(mm) radix__local_flush_tlb_mm(mm) +#define radix__flush_tlb_page(vma,addr) radix__local_flush_tlb_page(vma,addr) +#define radix___flush_tlb_page(mm,addr,p,i) radix___local_flush_tlb_page(mm,addr,p,i) +#endif + +#endif diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h new file mode 100644 index 000000000000..d98424ae356c --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -0,0 +1,76 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H +#define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H + +#define MMU_NO_CONTEXT ~0UL + + +#include +#include + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (radix_enabled()) + return radix__flush_tlb_range(vma, start, end); + return hash__flush_tlb_range(vma, start, end); +} + +static inline void flush_tlb_kernel_range(unsigned long start, + unsigned long end) +{ + if (radix_enabled()) + return radix__flush_tlb_kernel_range(start, end); + return hash__flush_tlb_kernel_range(start, end); +} + +static inline void local_flush_tlb_mm(struct mm_struct *mm) +{ + if (radix_enabled()) + return radix__local_flush_tlb_mm(mm); + return hash__local_flush_tlb_mm(mm); +} + +static inline void local_flush_tlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + if (radix_enabled()) + return radix__local_flush_tlb_page(vma, vmaddr); + return hash__local_flush_tlb_page(vma, vmaddr); +} + +static inline void flush_tlb_page_nohash(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + if (radix_enabled()) + return radix__flush_tlb_page(vma, vmaddr); + return hash__flush_tlb_page_nohash(vma, vmaddr); +} + +static inline void tlb_flush(struct mmu_gather *tlb) +{ + if (radix_enabled()) + return radix__tlb_flush(tlb); + return hash__tlb_flush(tlb); +} + +#ifdef CONFIG_SMP +static inline void flush_tlb_mm(struct mm_struct *mm) +{ + if (radix_enabled()) + return radix__flush_tlb_mm(mm); + return hash__flush_tlb_mm(mm); +} + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + if (radix_enabled()) + return radix__flush_tlb_page(vma, vmaddr); + return hash__flush_tlb_page(vma, vmaddr); +} +#else +#define flush_tlb_mm(mm) local_flush_tlb_mm(mm) +#define flush_tlb_page(vma, addr) local_flush_tlb_page(vma, addr) +#endif /* CONFIG_SMP */ + +#endif /* _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */ diff --git a/arch/powerpc/include/asm/book3s/pgalloc.h b/arch/powerpc/include/asm/book3s/pgalloc.h new file mode 100644 index 000000000000..54f591e9572e --- /dev/null +++ b/arch/powerpc/include/asm/book3s/pgalloc.h @@ -0,0 +1,19 @@ +#ifndef _ASM_POWERPC_BOOK3S_PGALLOC_H +#define _ASM_POWERPC_BOOK3S_PGALLOC_H + +#include + +extern void tlb_remove_table(struct mmu_gather *tlb, void *table); +static inline void tlb_flush_pgtable(struct mmu_gather *tlb, + unsigned long address) +{ + +} + +#ifdef CONFIG_PPC64 +#include +#else +#include +#endif + +#endif /* _ASM_POWERPC_BOOK3S_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 42814f0567cc..e2d9f4996e5c 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -8,6 +8,8 @@ extern struct kmem_cache *hugepte_cache; #ifdef CONFIG_PPC_BOOK3S_64 + +#include /* * This should work for other subarchs too. But right now we use the * new format only for 64bit book3s @@ -31,7 +33,19 @@ static inline unsigned int hugepd_shift(hugepd_t hpd) { return mmu_psize_to_shift(hugepd_mmu_psize(hpd)); } +static inline void flush_hugetlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + if (radix_enabled()) + return radix__flush_hugetlb_page(vma, vmaddr); +} +static inline void __local_flush_hugetlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + if (radix_enabled()) + return radix__local_flush_hugetlb_page(vma, vmaddr); +} #else static inline pte_t *hugepd_page(hugepd_t hpd) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 7529aab068f5..1f4497fb5b83 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -276,19 +276,24 @@ static inline unsigned long hpte_make_readonly(unsigned long ptel) return ptel; } -static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) +static inline bool hpte_cache_flags_ok(unsigned long hptel, bool is_ci) { - unsigned int wimg = ptel & HPTE_R_WIMG; + unsigned int wimg = hptel & HPTE_R_WIMG; /* Handle SAO */ if (wimg == (HPTE_R_W | HPTE_R_I | HPTE_R_M) && cpu_has_feature(CPU_FTR_ARCH_206)) wimg = HPTE_R_M; - if (!io_type) + if (!is_ci) return wimg == HPTE_R_M; - - return (wimg & (HPTE_R_W | HPTE_R_I)) == io_type; + /* + * if host is mapped cache inhibited, make sure hptel also have + * cache inhibited. + */ + if (wimg & HPTE_R_W) /* FIXME!! is this ok for all guest. ? */ + return false; + return !!(wimg & HPTE_R_I); } /* @@ -305,9 +310,9 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing) */ old_pte = READ_ONCE(*ptep); /* - * wait until _PAGE_BUSY is clear then set it atomically + * wait until H_PAGE_BUSY is clear then set it atomically */ - if (unlikely(pte_val(old_pte) & _PAGE_BUSY)) { + if (unlikely(pte_val(old_pte) & H_PAGE_BUSY)) { cpu_relax(); continue; } @@ -319,27 +324,12 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing) if (writing && pte_write(old_pte)) new_pte = pte_mkdirty(new_pte); - if (pte_val(old_pte) == __cmpxchg_u64((unsigned long *)ptep, - pte_val(old_pte), - pte_val(new_pte))) { + if (pte_xchg(ptep, old_pte, new_pte)) break; - } } return new_pte; } - -/* Return HPTE cache control bits corresponding to Linux pte bits */ -static inline unsigned long hpte_cache_bits(unsigned long pte_val) -{ -#if _PAGE_NO_CACHE == HPTE_R_I && _PAGE_WRITETHRU == HPTE_R_W - return pte_val & (HPTE_R_W | HPTE_R_I); -#else - return ((pte_val & _PAGE_NO_CACHE) ? HPTE_R_I : 0) + - ((pte_val & _PAGE_WRITETHRU) ? HPTE_R_W : 0); -#endif -} - static inline bool hpte_read_permission(unsigned long pp, unsigned long key) { if (key) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index fd22442d30a9..6bdcd0da9e21 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -256,6 +256,7 @@ struct machdep_calls { #ifdef CONFIG_ARCH_RANDOM int (*get_random_seed)(unsigned long *v); #endif + int (*update_partition_table)(u64); }; extern void e500_idle(void); diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 8ca1c983bf6c..e53ebebff474 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -88,6 +88,11 @@ */ #define MMU_FTR_1T_SEGMENT ASM_CONST(0x40000000) +/* + * Radix page table available + */ +#define MMU_FTR_RADIX ASM_CONST(0x80000000) + /* MMU feature bit sets for various CPUs */ #define MMU_FTRS_DEFAULT_HPTE_ARCH_V2 \ MMU_FTR_HPTE_TABLE | MMU_FTR_PPCAS_ARCH_V2 @@ -110,9 +115,25 @@ DECLARE_PER_CPU(int, next_tlbcam_idx); #endif +enum { + MMU_FTRS_POSSIBLE = MMU_FTR_HPTE_TABLE | MMU_FTR_TYPE_8xx | + MMU_FTR_TYPE_40x | MMU_FTR_TYPE_44x | MMU_FTR_TYPE_FSL_E | + MMU_FTR_TYPE_47x | MMU_FTR_USE_HIGH_BATS | MMU_FTR_BIG_PHYS | + MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_USE_TLBILX | + MMU_FTR_LOCK_BCAST_INVAL | MMU_FTR_NEED_DTLB_SW_LRU | + MMU_FTR_USE_TLBRSRV | MMU_FTR_USE_PAIRED_MAS | + MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL | + MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE | + MMU_FTR_1T_SEGMENT | +#ifdef CONFIG_PPC_RADIX_MMU + MMU_FTR_RADIX | +#endif + 0, +}; + static inline int mmu_has_feature(unsigned long feature) { - return (cur_cpu_spec->mmu_features & feature); + return (MMU_FTRS_POSSIBLE & cur_cpu_spec->mmu_features & feature); } static inline void mmu_clear_feature(unsigned long feature) @@ -122,13 +143,6 @@ static inline void mmu_clear_feature(unsigned long feature) extern unsigned int __start___mmu_ftr_fixup, __stop___mmu_ftr_fixup; -/* MMU initialization */ -extern void early_init_mmu(void); -extern void early_init_mmu_secondary(void); - -extern void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size); - #ifdef CONFIG_PPC64 /* This is our real memory area size on ppc64 server, on embedded, we * make it match the size our of bolted TLB area @@ -181,10 +195,20 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) #define MMU_PAGE_COUNT 15 -#if defined(CONFIG_PPC_STD_MMU_64) -/* 64-bit classic hash table MMU */ -#include -#elif defined(CONFIG_PPC_STD_MMU_32) +#ifdef CONFIG_PPC_BOOK3S_64 +#include +#else /* CONFIG_PPC_BOOK3S_64 */ + +#ifndef __ASSEMBLY__ +/* MMU initialization */ +extern void early_init_mmu(void); +extern void early_init_mmu_secondary(void); +extern void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size); +#endif /* __ASSEMBLY__ */ +#endif + +#if defined(CONFIG_PPC_STD_MMU_32) /* 32-bit classic hash table MMU */ #include #elif defined(CONFIG_40x) @@ -201,6 +225,9 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) # include #endif +#ifndef radix_enabled +#define radix_enabled() (0) +#endif #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_MMU_H_ */ diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 4eaab40e3ade..9d2cd0c36ec2 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -33,16 +33,27 @@ extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); #endif - -extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next); extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm); extern void set_context(unsigned long id, pgd_t *pgd); #ifdef CONFIG_PPC_BOOK3S_64 +extern void radix__switch_mmu_context(struct mm_struct *prev, + struct mm_struct *next); +static inline void switch_mmu_context(struct mm_struct *prev, + struct mm_struct *next, + struct task_struct *tsk) +{ + if (radix_enabled()) + return radix__switch_mmu_context(prev, next); + return switch_slb(tsk, next); +} + extern int __init_new_context(void); extern void __destroy_context(int context_id); static inline void mmu_context_init(void) { } #else +extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); extern unsigned long __init_new_context(void); extern void __destroy_context(unsigned long context_id); extern void mmu_context_init(void); @@ -88,17 +99,11 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, if (cpu_has_feature(CPU_FTR_ALTIVEC)) asm volatile ("dssall"); #endif /* CONFIG_ALTIVEC */ - - /* The actual HW switching method differs between the various - * sub architectures. + /* + * The actual HW switching method differs between the various + * sub architectures. Out of line for now */ -#ifdef CONFIG_PPC_STD_MMU_64 - switch_slb(tsk, next); -#else - /* Out of line for now */ - switch_mmu_context(prev, next); -#endif - + switch_mmu_context(prev, next, tsk); } #define deactivate_mm(tsk,mm) do { } while (0) diff --git a/arch/powerpc/include/asm/pgalloc-32.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h similarity index 100% rename from arch/powerpc/include/asm/pgalloc-32.h rename to arch/powerpc/include/asm/nohash/32/pgalloc.h diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h similarity index 68% rename from arch/powerpc/include/asm/pgalloc-64.h rename to arch/powerpc/include/asm/nohash/64/pgalloc.h index 8d5fc3ac43da..0c12a3bfe2ab 100644 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -53,7 +53,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #ifndef CONFIG_PPC_64K_PAGES -#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, __pgtable_ptr_val(PUD)) +#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, (unsigned long)PUD) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { @@ -68,19 +68,19 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { - pud_set(pud, __pgtable_ptr_val(pmd)); + pud_set(pud, (unsigned long)pmd); } static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { - pmd_set(pmd, __pgtable_ptr_val(pte)); + pmd_set(pmd, (unsigned long)pte); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { - pmd_set(pmd, __pgtable_ptr_val(page_address(pte_page))); + pmd_set(pmd, (unsigned long)page_address(pte_page)); } #define pmd_pgtable(pmd) pmd_page(pmd) @@ -119,119 +119,65 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) __free_page(ptepage); } -static inline void pgtable_free(void *table, unsigned index_size) -{ - if (!index_size) - free_page((unsigned long)table); - else { - BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE); - kmem_cache_free(PGT_CACHE(index_size), table); - } -} - +extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); #ifdef CONFIG_SMP -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - unsigned long pgf = (unsigned long)table; - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - pgf |= shift; - tlb_remove_table(tlb, (void *)pgf); -} - -static inline void __tlb_remove_table(void *_table) -{ - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); - unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; - - pgtable_free(table, shift); -} -#else /* !CONFIG_SMP */ -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - pgtable_free(table, shift); -} -#endif /* CONFIG_SMP */ - +extern void __tlb_remove_table(void *_table); +#endif static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, unsigned long address) { tlb_flush_pgtable(tlb, address); - pgtable_page_dtor(table); pgtable_free_tlb(tlb, page_address(table), 0); } #else /* if CONFIG_PPC_64K_PAGES */ -extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int); -extern void page_table_free(struct mm_struct *, unsigned long *, int); +extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int); +extern void pte_fragment_free(unsigned long *, int); extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); #ifdef CONFIG_SMP extern void __tlb_remove_table(void *_table); #endif -#ifndef __PAGETABLE_PUD_FOLDED -/* book3s 64 is 4 level page table */ -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) -{ - pgd_set(pgd, __pgtable_ptr_val(pud)); -} - -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), - GFP_KERNEL|__GFP_REPEAT); -} - -static inline void pud_free(struct mm_struct *mm, pud_t *pud) -{ - kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); -} -#endif - -static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) -{ - pud_set(pud, __pgtable_ptr_val(pmd)); -} +#define pud_populate(mm, pud, pmd) pud_set(pud, (unsigned long)pmd) static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { - pmd_set(pmd, __pgtable_ptr_val(pte)); + pmd_set(pmd, (unsigned long)pte); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { - pmd_set(pmd, __pgtable_ptr_val(pte_page)); + pmd_set(pmd, (unsigned long)pte_page); } static inline pgtable_t pmd_pgtable(pmd_t pmd) { - return (pgtable_t)pmd_page_vaddr(pmd); + return (pgtable_t)(pmd_val(pmd) & ~PMD_MASKED_BITS); } static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)page_table_alloc(mm, address, 1); + return (pte_t *)pte_fragment_alloc(mm, address, 1); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { - return (pgtable_t)page_table_alloc(mm, address, 0); + return (pgtable_t)pte_fragment_alloc(mm, address, 0); } static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - page_table_free(mm, (unsigned long *)pte, 1); + pte_fragment_fre((unsigned long *)pte, 1); } static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) { - page_table_free(mm, (unsigned long *)ptepage, 0); + pte_fragment_free((unsigned long *)ptepage, 0); } static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, @@ -255,11 +201,11 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #define __pmd_free_tlb(tlb, pmd, addr) \ pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX) -#ifndef __PAGETABLE_PUD_FOLDED +#ifndef CONFIG_PPC_64K_PAGES #define __pud_free_tlb(tlb, pud, addr) \ pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) -#endif /* __PAGETABLE_PUD_FOLDED */ +#endif /* CONFIG_PPC_64K_PAGES */ #define check_pgt_cache() do { } while (0) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 10debb93c4a4..d4d808cf905e 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -108,9 +108,6 @@ #ifndef __ASSEMBLY__ /* pte_clear moved to later in this file */ -/* Pointers in the page table tree are virtual addresses */ -#define __pgtable_ptr_val(ptr) ((unsigned long)(ptr)) - #define PMD_BAD_BITS (PTE_TABLE_SIZE-1) #define PUD_BAD_BITS (PMD_TABLE_SIZE-1) @@ -362,6 +359,13 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); void pgtable_cache_init(void); +extern int map_kernel_page(unsigned long ea, unsigned long pa, + unsigned long flags); +extern int __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys); +extern void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size); #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_NOHASH_64_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h new file mode 100644 index 000000000000..b39ec956d71e --- /dev/null +++ b/arch/powerpc/include/asm/nohash/pgalloc.h @@ -0,0 +1,23 @@ +#ifndef _ASM_POWERPC_NOHASH_PGALLOC_H +#define _ASM_POWERPC_NOHASH_PGALLOC_H + +#include + +extern void tlb_remove_table(struct mmu_gather *tlb, void *table); +#ifdef CONFIG_PPC64 +extern void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address); +#else +/* 44x etc which is BOOKE not BOOK3E */ +static inline void tlb_flush_pgtable(struct mmu_gather *tlb, + unsigned long address) +{ + +} +#endif /* !CONFIG_PPC_BOOK3E */ + +#ifdef CONFIG_PPC64 +#include +#else +#include +#endif +#endif /* _ASM_POWERPC_NOHASH_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index f8faaaeeca1e..9bb8ddf0be37 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -368,16 +368,16 @@ enum OpalLPCAddressType { }; enum opal_msg_type { - OPAL_MSG_ASYNC_COMP = 0, /* params[0] = token, params[1] = rc, + OPAL_MSG_ASYNC_COMP = 0, /* params[0] = token, params[1] = rc, * additional params function-specific */ - OPAL_MSG_MEM_ERR, - OPAL_MSG_EPOW, - OPAL_MSG_SHUTDOWN, /* params[0] = 1 reboot, 0 shutdown */ - OPAL_MSG_HMI_EVT, - OPAL_MSG_DPO, - OPAL_MSG_PRD, - OPAL_MSG_OCC, + OPAL_MSG_MEM_ERR = 1, + OPAL_MSG_EPOW = 2, + OPAL_MSG_SHUTDOWN = 3, /* params[0] = 1 reboot, 0 shutdown */ + OPAL_MSG_HMI_EVT = 4, + OPAL_MSG_DPO = 5, + OPAL_MSG_PRD = 6, + OPAL_MSG_OCC = 7, OPAL_MSG_TYPE_MAX, }; diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index ab3d8977bacd..51db3a37bced 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -288,7 +288,11 @@ extern long long virt_phys_offset; #ifndef __ASSEMBLY__ +#ifdef CONFIG_PPC_BOOK3S_64 +#include +#else #include +#endif typedef struct { signed long pd; } hugepd_t; @@ -312,12 +316,20 @@ void arch_free_page(struct page *page, int order); #endif struct vm_area_struct; - +#ifdef CONFIG_PPC_BOOK3S_64 +/* + * For BOOK3s 64 with 4k and 64K linux page size + * we want to use pointers, because the page table + * actually store pfn + */ +typedef pte_t *pgtable_t; +#else #if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC64) typedef pte_t *pgtable_t; #else typedef struct page *pgtable_t; #endif +#endif #include #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index d908a46d05c0..dd5f0712afa2 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -93,7 +93,7 @@ extern u64 ppc64_pft_size; #define SLICE_LOW_TOP (0x100000000ul) #define SLICE_NUM_LOW (SLICE_LOW_TOP >> SLICE_LOW_SHIFT) -#define SLICE_NUM_HIGH (PGTABLE_RANGE >> SLICE_HIGH_SHIFT) +#define SLICE_NUM_HIGH (H_PGTABLE_RANGE >> SLICE_HIGH_SHIFT) #define GET_LOW_SLICE_INDEX(addr) ((addr) >> SLICE_LOW_SHIFT) #define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT) @@ -128,8 +128,6 @@ extern void slice_set_user_psize(struct mm_struct *mm, unsigned int psize); extern void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize); -#define slice_mm_new_context(mm) ((mm)->context.id == MMU_NO_CONTEXT) - #endif /* __ASSEMBLY__ */ #else #define slice_init() @@ -151,7 +149,6 @@ do { \ #define slice_set_range_psize(mm, start, len, psize) \ slice_set_user_psize((mm), (psize)) -#define slice_mm_new_context(mm) 1 #endif /* CONFIG_PPC_MM_SLICES */ #ifdef CONFIG_HUGETLB_PAGE diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index f5056e3394b4..467c0b05b6fb 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -17,33 +17,34 @@ struct device_node; * PCI controller operations */ struct pci_controller_ops { - void (*dma_dev_setup)(struct pci_dev *dev); + void (*dma_dev_setup)(struct pci_dev *pdev); void (*dma_bus_setup)(struct pci_bus *bus); - int (*probe_mode)(struct pci_bus *); + int (*probe_mode)(struct pci_bus *bus); /* Called when pci_enable_device() is called. Returns true to * allow assignment/enabling of the device. */ - bool (*enable_device_hook)(struct pci_dev *); + bool (*enable_device_hook)(struct pci_dev *pdev); - void (*disable_device)(struct pci_dev *); + void (*disable_device)(struct pci_dev *pdev); - void (*release_device)(struct pci_dev *); + void (*release_device)(struct pci_dev *pdev); /* Called during PCI resource reassignment */ - resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type); - void (*reset_secondary_bus)(struct pci_dev *dev); + resource_size_t (*window_alignment)(struct pci_bus *bus, + unsigned long type); + void (*reset_secondary_bus)(struct pci_dev *pdev); #ifdef CONFIG_PCI_MSI - int (*setup_msi_irqs)(struct pci_dev *dev, + int (*setup_msi_irqs)(struct pci_dev *pdev, int nvec, int type); - void (*teardown_msi_irqs)(struct pci_dev *dev); + void (*teardown_msi_irqs)(struct pci_dev *pdev); #endif - int (*dma_set_mask)(struct pci_dev *dev, u64 dma_mask); - u64 (*dma_get_required_mask)(struct pci_dev *dev); + int (*dma_set_mask)(struct pci_dev *pdev, u64 dma_mask); + u64 (*dma_get_required_mask)(struct pci_dev *pdev); - void (*shutdown)(struct pci_controller *); + void (*shutdown)(struct pci_controller *hose); }; /* @@ -208,14 +209,14 @@ struct pci_dn { #ifdef CONFIG_EEH struct eeh_dev *edev; /* eeh device */ #endif -#define IODA_INVALID_PE (-1) +#define IODA_INVALID_PE 0xFFFFFFFF #ifdef CONFIG_PPC_POWERNV - int pe_number; + unsigned int pe_number; int vf_index; /* VF index in the PF */ #ifdef CONFIG_PCI_IOV u16 vfs_expanded; /* number of VFs IOV BAR expanded */ u16 num_vfs; /* number of VFs enabled*/ - int *pe_num_map; /* PE# for the first VF PE or array */ + unsigned int *pe_num_map; /* PE# for the first VF PE or array */ bool m64_single_mode; /* Use M64 BAR in Single Mode */ #define IODA_INVALID_M64 (-1) int (*m64_map)[PCI_SRIOV_NUM_BARS]; @@ -234,7 +235,9 @@ extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus, extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev); extern struct pci_dn *add_dev_pci_data(struct pci_dev *pdev); extern void remove_dev_pci_data(struct pci_dev *pdev); -extern void *update_dn_pci_info(struct device_node *dn, void *data); +extern struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, + struct device_node *dn); +extern void pci_remove_device_node_info(struct device_node *dn); static inline int pci_device_from_OF_node(struct device_node *np, u8 *bus, u8 *devfn) @@ -256,13 +259,13 @@ static inline struct eeh_dev *pdn_to_eeh_dev(struct pci_dn *pdn) #endif /** Find the bus corresponding to the indicated device node */ -extern struct pci_bus *pcibios_find_pci_bus(struct device_node *dn); +extern struct pci_bus *pci_find_bus_by_node(struct device_node *dn); /** Remove all of the PCI devices under this bus */ -extern void pcibios_remove_pci_devices(struct pci_bus *bus); +extern void pci_hp_remove_devices(struct pci_bus *bus); /** Discover new pci devices under this bus, and add them */ -extern void pcibios_add_pci_devices(struct pci_bus *bus); +extern void pci_hp_add_devices(struct pci_bus *bus); extern void isa_bridge_find_early(struct pci_controller *hose); diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index fc3ee06eab87..0413457ba11d 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -1,25 +1,12 @@ #ifndef _ASM_POWERPC_PGALLOC_H #define _ASM_POWERPC_PGALLOC_H -#ifdef __KERNEL__ #include -#ifdef CONFIG_PPC_BOOK3E -extern void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address); -#else /* CONFIG_PPC_BOOK3E */ -static inline void tlb_flush_pgtable(struct mmu_gather *tlb, - unsigned long address) -{ -} -#endif /* !CONFIG_PPC_BOOK3E */ - -extern void tlb_remove_table(struct mmu_gather *tlb, void *table); - -#ifdef CONFIG_PPC64 -#include +#ifdef CONFIG_PPC_BOOK3S +#include #else -#include +#include #endif -#endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h new file mode 100644 index 000000000000..e2bf208605b1 --- /dev/null +++ b/arch/powerpc/include/asm/pgtable-be-types.h @@ -0,0 +1,92 @@ +#ifndef _ASM_POWERPC_PGTABLE_BE_TYPES_H +#define _ASM_POWERPC_PGTABLE_BE_TYPES_H + +#include + +/* PTE level */ +typedef struct { __be64 pte; } pte_t; +#define __pte(x) ((pte_t) { cpu_to_be64(x) }) +static inline unsigned long pte_val(pte_t x) +{ + return be64_to_cpu(x.pte); +} + +static inline __be64 pte_raw(pte_t x) +{ + return x.pte; +} + +/* PMD level */ +#ifdef CONFIG_PPC64 +typedef struct { __be64 pmd; } pmd_t; +#define __pmd(x) ((pmd_t) { cpu_to_be64(x) }) +static inline unsigned long pmd_val(pmd_t x) +{ + return be64_to_cpu(x.pmd); +} + +static inline __be64 pmd_raw(pmd_t x) +{ + return x.pmd; +} + +/* + * 64 bit hash always use 4 level table. Everybody else use 4 level + * only for 4K page size. + */ +#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES) +typedef struct { __be64 pud; } pud_t; +#define __pud(x) ((pud_t) { cpu_to_be64(x) }) +static inline unsigned long pud_val(pud_t x) +{ + return be64_to_cpu(x.pud); +} +#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */ +#endif /* CONFIG_PPC64 */ + +/* PGD level */ +typedef struct { __be64 pgd; } pgd_t; +#define __pgd(x) ((pgd_t) { cpu_to_be64(x) }) +static inline unsigned long pgd_val(pgd_t x) +{ + return be64_to_cpu(x.pgd); +} + +/* Page protection bits */ +typedef struct { unsigned long pgprot; } pgprot_t; +#define pgprot_val(x) ((x).pgprot) +#define __pgprot(x) ((pgprot_t) { (x) }) + +/* + * With hash config 64k pages additionally define a bigger "real PTE" type that + * gathers the "second half" part of the PTE for pseudo 64k pages + */ +#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64) +typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; +#else +typedef struct { pte_t pte; } real_pte_t; +#endif + +static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) +{ + unsigned long *p = (unsigned long *)ptep; + __be64 prev; + + prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pte_raw(old), + (__force unsigned long)pte_raw(new)); + + return pte_raw(old) == prev; +} + +static inline bool pmd_xchg(pmd_t *pmdp, pmd_t old, pmd_t new) +{ + unsigned long *p = (unsigned long *)pmdp; + __be64 prev; + + prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pmd_raw(old), + (__force unsigned long)pmd_raw(new)); + + return pmd_raw(old) == prev; +} + +#endif /* _ASM_POWERPC_PGTABLE_BE_TYPES_H */ diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index 43140f8b0592..e7f4f3e0fcde 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -1,9 +1,6 @@ #ifndef _ASM_POWERPC_PGTABLE_TYPES_H #define _ASM_POWERPC_PGTABLE_TYPES_H -#ifdef CONFIG_STRICT_MM_TYPECHECKS -/* These are used to make use of C type-checking. */ - /* PTE level */ typedef struct { pte_basic_t pte; } pte_t; #define __pte(x) ((pte_t) { (x) }) @@ -48,49 +45,6 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) }) -#else - -/* - * .. while these make it easier on the compiler - */ - -typedef pte_basic_t pte_t; -#define __pte(x) (x) -static inline pte_basic_t pte_val(pte_t pte) -{ - return pte; -} - -#ifdef CONFIG_PPC64 -typedef unsigned long pmd_t; -#define __pmd(x) (x) -static inline unsigned long pmd_val(pmd_t pmd) -{ - return pmd; -} - -#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES) -typedef unsigned long pud_t; -#define __pud(x) (x) -static inline unsigned long pud_val(pud_t pud) -{ - return pud; -} -#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */ -#endif /* CONFIG_PPC64 */ - -typedef unsigned long pgd_t; -#define __pgd(x) (x) -static inline unsigned long pgd_val(pgd_t pgd) -{ - return pgd; -} - -typedef unsigned long pgprot_t; -#define pgprot_val(x) (x) -#define __pgprot(x) (x) - -#endif /* CONFIG_STRICT_MM_TYPECHECKS */ /* * With hash config 64k pages additionally define a bigger "real PTE" type that * gathers the "second half" part of the PTE for pseudo 64k pages @@ -100,4 +54,16 @@ typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; #else typedef struct { pte_t pte; } real_pte_t; #endif + +#ifdef CONFIG_PPC_STD_MMU_64 +#include + +static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) +{ + unsigned long *p = (unsigned long *)ptep; + + return pte_val(old) == __cmpxchg_u64(p, pte_val(old), pte_val(new)); +} +#endif + #endif /* _ASM_POWERPC_PGTABLE_TYPES_H */ diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 7ab04fc59e24..1d035c1cc889 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -131,6 +131,7 @@ /* sorted alphabetically */ #define PPC_INST_BHRBE 0x7c00025c #define PPC_INST_CLRBHRB 0x7c00035c +#define PPC_INST_CP_ABORT 0x7c00068c #define PPC_INST_DCBA 0x7c0005ec #define PPC_INST_DCBA_MASK 0xfc0007fe #define PPC_INST_DCBAL 0x7c2005ec @@ -285,6 +286,7 @@ #endif /* Deal with instructions that older assemblers aren't aware of */ +#define PPC_CP_ABORT stringify_in_c(.long PPC_INST_CP_ABORT) #define PPC_DCBAL(a, b) stringify_in_c(.long PPC_INST_DCBAL | \ __PPC_RA(a) | __PPC_RB(b)) #define PPC_DCBZL(a, b) stringify_in_c(.long PPC_INST_DCBZL | \ diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index ca0c5bff7849..8753e4eb9ab5 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -33,9 +33,9 @@ extern struct pci_dev *isa_bridge_pcidev; /* may be NULL if no ISA bus */ struct device_node; struct pci_dn; -typedef void *(*traverse_func)(struct device_node *me, void *data); -void *traverse_pci_devices(struct device_node *start, traverse_func pre, - void *data); +void *pci_traverse_device_nodes(struct device_node *start, + void *(*fn)(struct device_node *, void *), + void *data); void *traverse_pci_dn(struct pci_dn *root, void *(*fn)(struct pci_dn *, void *), void *data); diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 499d9f89435a..2b31632376a5 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -427,7 +427,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) li r4,1024; \ mtctr r4; \ lis r4,KERNELBASE@h; \ + .machine push; \ + .machine "power4"; \ 0: tlbie r4; \ + .machine pop; \ addi r4,r4,0x1000; \ bdnz 0b #endif diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 1ec67b043065..2eeaf80d41b7 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -76,6 +76,16 @@ */ #ifndef __ASSEMBLY__ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); + +/* + * Don't just check for any non zero bits in __PAGE_USER, since for book3e + * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in + * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too. + */ +static inline bool pte_user(pte_t pte) +{ + return (pte_val(pte) & _PAGE_USER) == _PAGE_USER; +} #endif /* __ASSEMBLY__ */ /* Location of the PFN in the PTE. Most 32-bit platforms use the same @@ -184,13 +194,6 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); /* Make modules code happy. We don't set RO yet */ #define PAGE_KERNEL_EXEC PAGE_KERNEL_X -/* - * Don't just check for any non zero bits in __PAGE_USER, since for book3e - * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in - * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too. - */ -#define pte_user(val) ((val & _PAGE_USER) == _PAGE_USER) - /* Advertise special mapping type for AGP */ #define PAGE_AGP (PAGE_KERNEL_NC) #define HAVE_PAGE_AGP @@ -198,3 +201,12 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); /* Advertise support for _PAGE_SPECIAL */ #define __HAVE_ARCH_PTE_SPECIAL +#ifndef _PAGE_READ +/* if not defined, we should not find _PAGE_WRITE too */ +#define _PAGE_READ 0 +#define _PAGE_WRITE _PAGE_RW +#endif + +#ifndef H_PAGE_4K_PFN +#define H_PAGE_4K_PFN 0 +#endif diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index f5f4c66bbbc9..c1e82e968506 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -347,6 +347,7 @@ #define LPCR_LPES_SH 2 #define LPCR_RMI 0x00000002 /* real mode is cache inhibit */ #define LPCR_HDICE 0x00000001 /* Hyp Decr enable (HV,PR,EE) */ +#define LPCR_UPRT 0x00400000 /* Use Process Table (ISA 3) */ #ifndef SPRN_LPID #define SPRN_LPID 0x13F /* Logical Partition Identifier */ #endif @@ -587,6 +588,7 @@ #define SPRN_PIR 0x3FF /* Processor Identification Register */ #endif #define SPRN_TIR 0x1BE /* Thread Identification Register */ +#define SPRN_PTCR 0x1D0 /* Partition table control Register */ #define SPRN_PSPB 0x09F /* Problem State Priority Boost reg */ #define SPRN_PTEHI 0x3D5 /* 981 7450 PTE HI word (S/W TLB load) */ #define SPRN_PTELO 0x3D6 /* 982 7450 PTE LO word (S/W TLB load) */ @@ -1182,6 +1184,7 @@ #define PVR_970GX 0x0045 #define PVR_POWER7p 0x004A #define PVR_POWER8E 0x004B +#define PVR_POWER8NVL 0x004C #define PVR_POWER8 0x004D #define PVR_BE 0x0070 #define PVR_PA6T 0x0090 diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h index 9f77f85e3e99..1b38eea28e5a 100644 --- a/arch/powerpc/include/asm/tlbflush.h +++ b/arch/powerpc/include/asm/tlbflush.h @@ -58,6 +58,7 @@ extern void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, #elif defined(CONFIG_PPC_STD_MMU_32) +#define MMU_NO_CONTEXT (0) /* * TLB flushing for "classic" hash-MMU 32-bit CPUs, 6xx, 7xx, 7xxx */ @@ -78,7 +79,7 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm) } #elif defined(CONFIG_PPC_STD_MMU_64) -#include +#include #else #error Unsupported MMU type #endif diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h new file mode 100644 index 000000000000..6a93209748a1 --- /dev/null +++ b/arch/powerpc/include/uapi/asm/perf_regs.h @@ -0,0 +1,50 @@ +#ifndef _UAPI_ASM_POWERPC_PERF_REGS_H +#define _UAPI_ASM_POWERPC_PERF_REGS_H + +enum perf_event_powerpc_regs { + PERF_REG_POWERPC_R0, + PERF_REG_POWERPC_R1, + PERF_REG_POWERPC_R2, + PERF_REG_POWERPC_R3, + PERF_REG_POWERPC_R4, + PERF_REG_POWERPC_R5, + PERF_REG_POWERPC_R6, + PERF_REG_POWERPC_R7, + PERF_REG_POWERPC_R8, + PERF_REG_POWERPC_R9, + PERF_REG_POWERPC_R10, + PERF_REG_POWERPC_R11, + PERF_REG_POWERPC_R12, + PERF_REG_POWERPC_R13, + PERF_REG_POWERPC_R14, + PERF_REG_POWERPC_R15, + PERF_REG_POWERPC_R16, + PERF_REG_POWERPC_R17, + PERF_REG_POWERPC_R18, + PERF_REG_POWERPC_R19, + PERF_REG_POWERPC_R20, + PERF_REG_POWERPC_R21, + PERF_REG_POWERPC_R22, + PERF_REG_POWERPC_R23, + PERF_REG_POWERPC_R24, + PERF_REG_POWERPC_R25, + PERF_REG_POWERPC_R26, + PERF_REG_POWERPC_R27, + PERF_REG_POWERPC_R28, + PERF_REG_POWERPC_R29, + PERF_REG_POWERPC_R30, + PERF_REG_POWERPC_R31, + PERF_REG_POWERPC_NIP, + PERF_REG_POWERPC_MSR, + PERF_REG_POWERPC_ORIG_R3, + PERF_REG_POWERPC_CTR, + PERF_REG_POWERPC_LINK, + PERF_REG_POWERPC_XER, + PERF_REG_POWERPC_CCR, + PERF_REG_POWERPC_SOFTE, + PERF_REG_POWERPC_TRAP, + PERF_REG_POWERPC_DAR, + PERF_REG_POWERPC_DSISR, + PERF_REG_POWERPC_MAX, +}; +#endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index c9370d4e36bd..9ea09551a2cd 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -438,7 +438,11 @@ int main(void) DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry)); #endif +#ifdef MAX_PGD_TABLE_SIZE + DEFINE(PGD_TABLE_SIZE, MAX_PGD_TABLE_SIZE); +#else DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE); +#endif DEFINE(PTE_SIZE, sizeof(pte_t)); #ifdef CONFIG_KVM diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index 41c011cb6070..8275858a434d 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -162,7 +162,7 @@ void btext_map(void) offset = ((unsigned long) dispDeviceBase) - base; size = dispDeviceRowBytes * dispDeviceRect[3] + offset + dispDeviceRect[0]; - vbase = __ioremap(base, size, _PAGE_NO_CACHE); + vbase = __ioremap(base, size, pgprot_val(pgprot_noncached_wc(__pgprot(0)))); if (vbase == 0) return; logicalDisplayBase = vbase + offset; diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 6c662b8de90d..eeeacf6235a3 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -63,7 +63,6 @@ extern void __setup_cpu_745x(unsigned long offset, struct cpu_spec* spec); extern void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec* spec); extern void __setup_cpu_ppc970MP(unsigned long offset, struct cpu_spec* spec); extern void __setup_cpu_pa6t(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_a2(unsigned long offset, struct cpu_spec* spec); extern void __restore_cpu_pa6t(void); extern void __restore_cpu_ppc970(void); extern void __setup_cpu_power7(unsigned long offset, struct cpu_spec* spec); @@ -72,7 +71,6 @@ extern void __setup_cpu_power8(unsigned long offset, struct cpu_spec* spec); extern void __restore_cpu_power8(void); extern void __setup_cpu_power9(unsigned long offset, struct cpu_spec* spec); extern void __restore_cpu_power9(void); -extern void __restore_cpu_a2(void); extern void __flush_tlb_power7(unsigned int action); extern void __flush_tlb_power8(unsigned int action); extern void __flush_tlb_power9(unsigned int action); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 6544017eb90b..c9bc78e9c610 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -48,7 +48,7 @@ /** Overview: - * EEH, or "Extended Error Handling" is a PCI bridge technology for + * EEH, or "Enhanced Error Handling" is a PCI bridge technology for * dealing with PCI bus errors that can't be dealt with within the * usual PCI framework, except by check-stopping the CPU. Systems * that are designed for high-availability/reliability cannot afford @@ -1068,7 +1068,7 @@ void eeh_add_device_early(struct pci_dn *pdn) struct pci_controller *phb; struct eeh_dev *edev = pdn_to_eeh_dev(pdn); - if (!edev || !eeh_enabled()) + if (!edev) return; if (!eeh_has_flag(EEH_PROBE_MODE_DEVTREE)) @@ -1336,14 +1336,11 @@ static int eeh_pe_change_owner(struct eeh_pe *pe) id->subdevice != pdev->subsystem_device) continue; - goto reset; + return eeh_pe_reset_and_recover(pe); } } return eeh_unfreeze_pe(pe, true); - -reset: - return eeh_pe_reset_and_recover(pe); } /** diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index fb6207d2c604..2714a3b81d24 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -171,6 +171,16 @@ static void *eeh_dev_save_state(void *data, void *userdata) if (!edev) return NULL; + /* + * We cannot access the config space on some adapters. + * Otherwise, it will cause fenced PHB. We don't save + * the content in their config space and will restore + * from the initial config space saved when the EEH + * device is created. + */ + if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) + return NULL; + pdev = eeh_dev_to_pci_dev(edev); if (!pdev) return NULL; @@ -312,6 +322,19 @@ static void *eeh_dev_restore_state(void *data, void *userdata) if (!edev) return NULL; + /* + * The content in the config space isn't saved because + * the blocked config space on some adapters. We have + * to restore the initial saved config space when the + * EEH device is created. + */ + if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) { + if (list_is_last(&edev->list, &edev->pe->edevs)) + eeh_pe_restore_bars(edev->pe); + + return NULL; + } + pdev = eeh_dev_to_pci_dev(edev); if (!pdev) return NULL; @@ -552,7 +575,7 @@ static int eeh_clear_pe_frozen_state(struct eeh_pe *pe, int eeh_pe_reset_and_recover(struct eeh_pe *pe) { - int result, ret; + int ret; /* Bail if the PE is being recovered */ if (pe->state & EEH_PE_RECOVERING) @@ -564,9 +587,6 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) /* Save states */ eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); - /* Report error */ - eeh_pe_dev_traverse(pe, eeh_report_error, &result); - /* Issue reset */ ret = eeh_reset_pe(pe); if (ret) { @@ -581,15 +601,9 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) return ret; } - /* Notify completion of reset */ - eeh_pe_dev_traverse(pe, eeh_report_reset, &result); - /* Restore device state */ eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); - /* Resume */ - eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); - /* Clear recovery mode */ eeh_pe_state_clear(pe, EEH_PE_RECOVERING); @@ -621,7 +635,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * We don't remove the corresponding PE instances because * we need the information afterwords. The attached EEH * devices are expected to be attached soon when calling - * into pcibios_add_pci_devices(). + * into pci_hp_add_devices(). */ eeh_pe_state_mark(pe, EEH_PE_KEEP); if (bus) { @@ -630,7 +644,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, } else { eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); pci_lock_rescan_remove(); - pcibios_remove_pci_devices(bus); + pci_hp_remove_devices(bus); pci_unlock_rescan_remove(); } } else if (frozen_bus) { @@ -681,7 +695,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, if (pe->type & EEH_PE_VF) eeh_add_virt_device(edev, NULL); else - pcibios_add_pci_devices(bus); + pci_hp_add_devices(bus); } else if (frozen_bus && rmv_data->removed) { pr_info("EEH: Sleep 5s ahead of partial hotplug\n"); ssleep(5); @@ -691,7 +705,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, if (pe->type & EEH_PE_VF) eeh_add_virt_device(edev, NULL); else - pcibios_add_pci_devices(frozen_bus); + pci_hp_add_devices(frozen_bus); } eeh_pe_state_clear(pe, EEH_PE_KEEP); @@ -896,7 +910,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); pci_lock_rescan_remove(); - pcibios_remove_pci_devices(frozen_bus); + pci_hp_remove_devices(frozen_bus); pci_unlock_rescan_remove(); } } @@ -981,7 +995,7 @@ static void eeh_handle_special_event(void) bus = eeh_pe_bus_get(phb_pe); eeh_pe_dev_traverse(pe, eeh_report_failure, NULL); - pcibios_remove_pci_devices(bus); + pci_hp_remove_devices(bus); } pci_unlock_rescan_remove(); } diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index 4eefb6e34dbb..82e7327e3cd0 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -36,7 +36,7 @@ static DEFINE_SPINLOCK(eeh_eventlist_lock); static struct semaphore eeh_eventlist_sem; -LIST_HEAD(eeh_eventlist); +static LIST_HEAD(eeh_eventlist); /** * eeh_event_handler - Dispatch EEH events. diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index eea48d8baf49..f0520da85759 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -249,7 +249,7 @@ static void *__eeh_pe_get(void *data, void *flag) } else { if (edev->pe_config_addr && (edev->pe_config_addr == pe->addr)) - return pe; + return pe; } /* Try BDF address */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 39a79c89a4b6..73e461a3dfbb 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -37,6 +37,7 @@ #include #include #include +#include /* * System calls. @@ -509,6 +510,14 @@ BEGIN_FTR_SECTION ldarx r6,0,r1 END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS) +BEGIN_FTR_SECTION +/* + * A cp_abort (copy paste abort) here ensures that when context switching, a + * copy from one process can't leak into the paste of another. + */ + PPC_CP_ABORT +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + #ifdef CONFIG_PPC_BOOK3S /* Cancel all explict user streams as they will have no use after context * switch and will stop the HW from creating streams itself @@ -520,7 +529,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS) std r6,PACACURRENT(r13) /* Set new 'current' */ ld r8,KSP(r4) /* new stack pointer */ -#ifdef CONFIG_PPC_BOOK3S +#ifdef CONFIG_PPC_STD_MMU_64 +BEGIN_MMU_FTR_SECTION + b 2f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX) BEGIN_FTR_SECTION clrrdi r6,r8,28 /* get its ESID */ clrrdi r9,r1,28 /* get current sp ESID */ @@ -566,7 +578,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) slbmte r7,r0 isync 2: -#endif /* !CONFIG_PPC_BOOK3S */ +#endif /* CONFIG_PPC_STD_MMU_64 */ CURRENT_THREAD_INFO(r7, r8) /* base of new stack */ /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7716cebf4b8e..4c9440629128 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -189,7 +189,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #endif /* CONFIG_PPC_P7_NAP */ EXCEPTION_PROLOG_0(PACA_EXMC) BEGIN_FTR_SECTION - b machine_check_pSeries_early + b machine_check_powernv_early FTR_SECTION_ELSE b machine_check_pSeries_0 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) @@ -209,11 +209,6 @@ data_access_slb_pSeries: EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380) std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_DAR -#ifdef __DISABLED__ - /* Keep that around for when we re-implement dynamic VSIDs */ - cmpdi r3,0 - bge slb_miss_user_pseries -#endif /* __DISABLED__ */ mfspr r12,SPRN_SRR1 #ifndef CONFIG_RELOCATABLE b slb_miss_realmode @@ -240,11 +235,6 @@ instruction_access_slb_pSeries: EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480) std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ -#ifdef __DISABLED__ - /* Keep that around for when we re-implement dynamic VSIDs */ - cmpdi r3,0 - bge slb_miss_user_pseries -#endif /* __DISABLED__ */ mfspr r12,SPRN_SRR1 #ifndef CONFIG_RELOCATABLE b slb_miss_realmode @@ -443,7 +433,7 @@ denorm_exception_hv: .align 7 /* moved from 0x200 */ -machine_check_pSeries_early: +machine_check_powernv_early: BEGIN_FTR_SECTION EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) /* @@ -709,34 +699,6 @@ system_reset_fwnmi: #endif /* CONFIG_PPC_PSERIES */ -#ifdef __DISABLED__ -/* - * This is used for when the SLB miss handler has to go virtual, - * which doesn't happen for now anymore but will once we re-implement - * dynamic VSIDs for shared page tables - */ -slb_miss_user_pseries: - std r10,PACA_EXGEN+EX_R10(r13) - std r11,PACA_EXGEN+EX_R11(r13) - std r12,PACA_EXGEN+EX_R12(r13) - GET_SCRATCH0(r10) - ld r11,PACA_EXSLB+EX_R9(r13) - ld r12,PACA_EXSLB+EX_R3(r13) - std r10,PACA_EXGEN+EX_R13(r13) - std r11,PACA_EXGEN+EX_R9(r13) - std r12,PACA_EXGEN+EX_R3(r13) - clrrdi r12,r13,32 - mfmsr r10 - mfspr r11,SRR0 /* save SRR0 */ - ori r12,r12,slb_miss_user_common@l /* virt addr of handler */ - ori r10,r10,MSR_IR|MSR_DR|MSR_RI - mtspr SRR0,r12 - mfspr r12,SRR1 /* and SRR1 */ - mtspr SRR1,r10 - rfid - b . /* prevent spec. execution */ -#endif /* __DISABLED__ */ - #ifdef CONFIG_KVM_BOOK3S_64_HANDLER kvmppc_skip_interrupt: /* @@ -764,11 +726,10 @@ kvmppc_skip_Hinterrupt: #endif /* - * Code from here down to __end_handlers is invoked from the - * exception prologs above. Because the prologs assemble the - * addresses of these handlers using the LOAD_HANDLER macro, - * which uses an ori instruction, these handlers must be in - * the first 64k of the kernel image. + * Ensure that any handlers that get invoked from the exception prologs + * above are below the first 64KB (0x10000) of the kernel image because + * the prologs assemble the addresses of these handlers using the + * LOAD_HANDLER macro, which uses an ori instruction. */ /*** Common interrupt handlers ***/ @@ -953,11 +914,6 @@ hv_facility_unavailable_relon_trampoline: #endif STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist) - /* Other future vectors */ - .align 7 - .globl __end_interrupts -__end_interrupts: - .align 7 system_call_entry: b system_call_common @@ -983,7 +939,13 @@ data_access_common: ld r3,PACA_EXGEN+EX_DAR(r13) lwz r4,PACA_EXGEN+EX_DSISR(r13) li r5,0x300 + std r3,_DAR(r1) + std r4,_DSISR(r1) +BEGIN_MMU_FTR_SECTION b do_hash_page /* Try to handle as hpte fault */ +MMU_FTR_SECTION_ELSE + b handle_page_fault +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX) .align 7 .globl h_data_storage_common @@ -1008,74 +970,16 @@ instruction_access_common: ld r3,_NIP(r1) andis. r4,r12,0x5820 li r5,0x400 + std r3,_DAR(r1) + std r4,_DSISR(r1) +BEGIN_MMU_FTR_SECTION b do_hash_page /* Try to handle as hpte fault */ +MMU_FTR_SECTION_ELSE + b handle_page_fault +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX) STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception) -/* - * Here is the common SLB miss user that is used when going to virtual - * mode for SLB misses, that is currently not used - */ -#ifdef __DISABLED__ - .align 7 - .globl slb_miss_user_common -slb_miss_user_common: - mflr r10 - std r3,PACA_EXGEN+EX_DAR(r13) - stw r9,PACA_EXGEN+EX_CCR(r13) - std r10,PACA_EXGEN+EX_LR(r13) - std r11,PACA_EXGEN+EX_SRR0(r13) - bl slb_allocate_user - - ld r10,PACA_EXGEN+EX_LR(r13) - ld r3,PACA_EXGEN+EX_R3(r13) - lwz r9,PACA_EXGEN+EX_CCR(r13) - ld r11,PACA_EXGEN+EX_SRR0(r13) - mtlr r10 - beq- slb_miss_fault - - andi. r10,r12,MSR_RI /* check for unrecoverable exception */ - beq- unrecov_user_slb - mfmsr r10 - -.machine push -.machine "power4" - mtcrf 0x80,r9 -.machine pop - - clrrdi r10,r10,2 /* clear RI before setting SRR0/1 */ - mtmsrd r10,1 - - mtspr SRR0,r11 - mtspr SRR1,r12 - - ld r9,PACA_EXGEN+EX_R9(r13) - ld r10,PACA_EXGEN+EX_R10(r13) - ld r11,PACA_EXGEN+EX_R11(r13) - ld r12,PACA_EXGEN+EX_R12(r13) - ld r13,PACA_EXGEN+EX_R13(r13) - rfid - b . - -slb_miss_fault: - EXCEPTION_PROLOG_COMMON(0x380, PACA_EXGEN) - ld r4,PACA_EXGEN+EX_DAR(r13) - li r5,0 - std r4,_DAR(r1) - std r5,_DSISR(r1) - b handle_page_fault - -unrecov_user_slb: - EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN) - RECONCILE_IRQ_STATE(r10, r11) - bl save_nvgprs -1: addi r3,r1,STACK_FRAME_OVERHEAD - bl unrecoverable_exception - b 1b - -#endif /* __DISABLED__ */ - - /* * Machine check is different because we use a different * save area: PACA_EXMC instead of PACA_EXGEN. @@ -1230,10 +1134,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) - .align 7 - .globl __end_handlers -__end_handlers: - /* Equivalents to the above handlers for relocation-on interrupt vectors */ STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) @@ -1244,6 +1144,17 @@ __end_handlers: STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) + /* + * The __end_interrupts marker must be past the out-of-line (OOL) + * handlers, so that they are copied to real address 0x100 when running + * a relocatable kernel. This ensures they can be reached from the short + * trampoline handlers (like 0x4f00, 0x4f20, etc.) which branch + * directly, without using LOAD_HANDLER(). + */ + .align 7 + .globl __end_interrupts +__end_interrupts: + #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * Data area reserved for FWNMI option. @@ -1476,8 +1387,11 @@ slb_miss_realmode: stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ +#ifdef CONFIG_PPC_STD_MMU_64 +BEGIN_MMU_FTR_SECTION bl slb_allocate_realmode - +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_RADIX) +#endif /* All done -- return from exception. */ ld r10,PACA_EXSLB+EX_LR(r13) @@ -1485,7 +1399,9 @@ slb_miss_realmode: lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ mtlr r10 - +BEGIN_MMU_FTR_SECTION + b 2f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX) andi. r10,r12,MSR_RI /* check for unrecoverable exception */ beq- 2f @@ -1536,9 +1452,7 @@ power4_fixup_nap: */ .align 7 do_hash_page: - std r3,_DAR(r1) - std r4,_DSISR(r1) - +#ifdef CONFIG_PPC_STD_MMU_64 andis. r0,r4,0xa410 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ andis. r0,r4,DSISR_DABRMATCH@h @@ -1566,6 +1480,7 @@ do_hash_page: /* Error */ blt- 13f +#endif /* CONFIG_PPC_STD_MMU_64 */ /* Here we have a page fault that hash_page can't handle. */ handle_page_fault: @@ -1592,6 +1507,7 @@ handle_dabr_fault: 12: b ret_from_except_lite +#ifdef CONFIG_PPC_STD_MMU_64 /* We have a page fault that hash_page could handle but HV refused * the PTE insertion */ @@ -1601,6 +1517,7 @@ handle_dabr_fault: ld r4,_DAR(r1) bl low_hash_fault b ret_from_except +#endif /* * We come here as a result of a DSI at a point where we don't want diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 9dac18dabd03..1123a4d8d8dd 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -607,3 +607,13 @@ unsigned long __init arch_syscall_addr(int nr) return sys_call_table[nr*2]; } #endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */ + +#if defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2) +char *arch_ftrace_match_adjust(char *str, const char *search) +{ + if (str[0] == '.' && search[0] != '.') + return str + 1; + else + return str; +} +#endif /* defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2) */ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 4286775cbde9..2d14774af6b4 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -973,13 +973,16 @@ start_here_common: * This stuff goes at the beginning of the bss, which is page-aligned. */ .section ".bss" - - .align PAGE_SHIFT - - .globl empty_zero_page -empty_zero_page: - .space PAGE_SIZE +/* + * pgd dir should be aligned to PGD_TABLE_SIZE which is 64K. + * We will need to find a better way to fix this + */ + .align 16 .globl swapper_pg_dir swapper_pg_dir: .space PGD_TABLE_SIZE + + .globl empty_zero_page +empty_zero_page: + .space PAGE_SIZE diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c index ac86c53e2542..a89f4f7a66bd 100644 --- a/arch/powerpc/kernel/ibmebus.c +++ b/arch/powerpc/kernel/ibmebus.c @@ -408,7 +408,7 @@ static ssize_t modalias_show(struct device *dev, return len+1; } -struct device_attribute ibmebus_bus_device_attrs[] = { +static struct device_attribute ibmebus_bus_device_attrs[] = { __ATTR_RO(devspec), __ATTR_RO(name), __ATTR_RO(modalias), diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index 0f1997097960..ae1316106e2b 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -109,14 +109,14 @@ static void pci_process_ISA_OF_ranges(struct device_node *isa_node, size = 0x10000; __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - size, _PAGE_NO_CACHE|_PAGE_GUARDED); + size, pgprot_val(pgprot_noncached(__pgprot(0)))); return; inval_range: printk(KERN_ERR "no ISA IO ranges or unexpected isa range, " "mapping 64k\n"); __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - 0x10000, _PAGE_NO_CACHE|_PAGE_GUARDED); + 0x10000, pgprot_val(pgprot_noncached(__pgprot(0)))); } diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index 015ae55c1868..2694d078741d 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -228,17 +228,12 @@ static struct property memory_limit_prop = { static void __init export_crashk_values(struct device_node *node) { - struct property *prop; - /* There might be existing crash kernel properties, but we can't * be sure what's in them, so remove them. */ - prop = of_find_property(node, "linux,crashkernel-base", NULL); - if (prop) - of_remove_property(node, prop); - - prop = of_find_property(node, "linux,crashkernel-size", NULL); - if (prop) - of_remove_property(node, prop); + of_remove_property(node, of_find_property(node, + "linux,crashkernel-base", NULL)); + of_remove_property(node, of_find_property(node, + "linux,crashkernel-size", NULL)); if (crashk_res.start != 0) { crashk_base = cpu_to_be_ulong(crashk_res.start), @@ -258,16 +253,13 @@ static void __init export_crashk_values(struct device_node *node) static int __init kexec_setup(void) { struct device_node *node; - struct property *prop; node = of_find_node_by_path("/chosen"); if (!node) return -ENOENT; /* remove any stale properties so ours can be found */ - prop = of_find_property(node, kernel_end_prop.name, NULL); - if (prop) - of_remove_property(node, prop); + of_remove_property(node, of_find_property(node, kernel_end_prop.name, NULL)); /* information needed by userspace when using default_machine_kexec */ kernel_end = cpu_to_be_ulong(__pa(_end)); diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 0fbd75d185d7..b8c202d63ecb 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -76,6 +76,7 @@ int default_machine_kexec_prepare(struct kimage *image) * end of the blocked region (begin >= high). Use the * boolean identity !(a || b) === (!a && !b). */ +#ifdef CONFIG_PPC_STD_MMU_64 if (htab_address) { low = __pa(htab_address); high = low + htab_size_bytes; @@ -88,6 +89,7 @@ int default_machine_kexec_prepare(struct kimage *image) return -ETXTBSY; } } +#endif /* CONFIG_PPC_STD_MMU_64 */ /* We also should not overwrite the tce tables */ for_each_node_by_type(node, "pci") { @@ -381,7 +383,7 @@ void default_machine_kexec(struct kimage *image) /* NOTREACHED */ } -#ifndef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_STD_MMU_64 /* Values we need to export to the second kernel via the device tree. */ static unsigned long htab_base; static unsigned long htab_size; @@ -401,7 +403,6 @@ static struct property htab_size_prop = { static int __init export_htab_values(void) { struct device_node *node; - struct property *prop; /* On machines with no htab htab_address is NULL */ if (!htab_address) @@ -412,12 +413,8 @@ static int __init export_htab_values(void) return -ENODEV; /* remove any stale propertys so ours can be found */ - prop = of_find_property(node, htab_base_prop.name, NULL); - if (prop) - of_remove_property(node, prop); - prop = of_find_property(node, htab_size_prop.name, NULL); - if (prop) - of_remove_property(node, prop); + of_remove_property(node, of_find_property(node, htab_base_prop.name, NULL)); + of_remove_property(node, of_find_property(node, htab_size_prop.name, NULL)); htab_base = cpu_to_be64(__pa(htab_address)); of_add_property(node, &htab_base_prop); @@ -428,4 +425,4 @@ static int __init export_htab_values(void) return 0; } late_initcall(export_htab_values); -#endif /* !CONFIG_PPC_BOOK3E */ +#endif /* CONFIG_PPC_STD_MMU_64 */ diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 671fd5122406..ef267fd9dd22 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -37,7 +37,7 @@ static DEFINE_PER_CPU(int, mce_queue_count); static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); static void machine_check_process_queued_event(struct irq_work *work); -struct irq_work mce_event_process_work = { +static struct irq_work mce_event_process_work = { .func = machine_check_process_queued_event, }; diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index ee62b197502d..7353991c4ece 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -72,11 +72,15 @@ void __flush_tlb_power8(unsigned int action) void __flush_tlb_power9(unsigned int action) { + if (radix_enabled()) + flush_tlb_206(POWER9_TLB_SETS_RADIX, action); + flush_tlb_206(POWER9_TLB_SETS_HASH, action); } /* flush SLBs and reload */ +#ifdef CONFIG_PPC_STD_MMU_64 static void flush_and_reload_slb(void) { struct slb_shadow *slb; @@ -110,6 +114,7 @@ static void flush_and_reload_slb(void) asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb)); } } +#endif static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) { @@ -120,6 +125,7 @@ static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) * reset the error bits whenever we handle them so that at the end * we can check whether we handled all of them or not. * */ +#ifdef CONFIG_PPC_STD_MMU_64 if (dsisr & slb_error_bits) { flush_and_reload_slb(); /* reset error bits */ @@ -131,6 +137,7 @@ static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) /* reset error bits */ dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB; } +#endif /* Any other errors we don't understand? */ if (dsisr & 0xffffffffUL) handled = 0; @@ -150,6 +157,7 @@ static long mce_handle_common_ierror(uint64_t srr1) switch (P7_SRR1_MC_IFETCH(srr1)) { case 0: break; +#ifdef CONFIG_PPC_STD_MMU_64 case P7_SRR1_MC_IFETCH_SLB_PARITY: case P7_SRR1_MC_IFETCH_SLB_MULTIHIT: /* flush and reload SLBs for SLB errors. */ @@ -162,6 +170,7 @@ static long mce_handle_common_ierror(uint64_t srr1) handled = 1; } break; +#endif default: break; } @@ -175,10 +184,12 @@ static long mce_handle_ierror_p7(uint64_t srr1) handled = mce_handle_common_ierror(srr1); +#ifdef CONFIG_PPC_STD_MMU_64 if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) { flush_and_reload_slb(); handled = 1; } +#endif return handled; } @@ -321,10 +332,12 @@ static long mce_handle_ierror_p8(uint64_t srr1) handled = mce_handle_common_ierror(srr1); +#ifdef CONFIG_PPC_STD_MMU_64 if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) { flush_and_reload_slb(); handled = 1; } +#endif return handled; } diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index bf5160fbf9d8..285ca8c6cc2e 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -599,12 +599,6 @@ _GLOBAL(__bswapdi2) mr r4,r10 blr -_GLOBAL(abs) - srawi r4,r3,31 - xor r3,r3,r4 - sub r3,r3,r4 - blr - #ifdef CONFIG_SMP _GLOBAL(start_secondary_resume) /* Reset stack */ diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 0cab9e8c3794..856f9a7944cd 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -15,8 +15,6 @@ * parsing code. */ -#include - #include #include #include @@ -1231,12 +1229,4 @@ static int __init nvram_init(void) return rc; } - -static void __exit nvram_cleanup(void) -{ - misc_deregister( &nvram_dev ); -} - -module_init(nvram_init); -module_exit(nvram_cleanup); -MODULE_LICENSE("GPL"); +device_initcall(nvram_init); diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index 59c436189f46..2d71269e7dc1 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -21,6 +21,35 @@ #include #include +static struct pci_bus *find_bus_among_children(struct pci_bus *bus, + struct device_node *dn) +{ + struct pci_bus *child = NULL; + struct pci_bus *tmp; + + if (pci_bus_to_OF_node(bus) == dn) + return bus; + + list_for_each_entry(tmp, &bus->children, node) { + child = find_bus_among_children(tmp, dn); + if (child) + break; + } + + return child; +} + +struct pci_bus *pci_find_bus_by_node(struct device_node *dn) +{ + struct pci_dn *pdn = PCI_DN(dn); + + if (!pdn || !pdn->phb || !pdn->phb->bus) + return NULL; + + return find_bus_among_children(pdn->phb->bus, dn); +} +EXPORT_SYMBOL_GPL(pci_find_bus_by_node); + /** * pcibios_release_device - release PCI device * @dev: PCI device @@ -38,20 +67,20 @@ void pcibios_release_device(struct pci_dev *dev) } /** - * pcibios_remove_pci_devices - remove all devices under this bus + * pci_hp_remove_devices - remove all devices under this bus * @bus: the indicated PCI bus * * Remove all of the PCI devices under this bus both from the * linux pci device tree, and from the powerpc EEH address cache. */ -void pcibios_remove_pci_devices(struct pci_bus *bus) +void pci_hp_remove_devices(struct pci_bus *bus) { struct pci_dev *dev, *tmp; struct pci_bus *child_bus; /* First go down child busses */ list_for_each_entry(child_bus, &bus->children, node) - pcibios_remove_pci_devices(child_bus); + pci_hp_remove_devices(child_bus); pr_debug("PCI: Removing devices on bus %04x:%02x\n", pci_domain_nr(bus), bus->number); @@ -60,11 +89,10 @@ void pcibios_remove_pci_devices(struct pci_bus *bus) pci_stop_and_remove_bus_device(dev); } } - -EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); +EXPORT_SYMBOL_GPL(pci_hp_remove_devices); /** - * pcibios_add_pci_devices - adds new pci devices to bus + * pci_hp_add_devices - adds new pci devices to bus * @bus: the indicated PCI bus * * This routine will find and fixup new pci devices under @@ -74,7 +102,7 @@ EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); * is how this routine differs from other, similar pcibios * routines.) */ -void pcibios_add_pci_devices(struct pci_bus * bus) +void pci_hp_add_devices(struct pci_bus *bus) { int slotno, mode, pass, max; struct pci_dev *dev; @@ -92,7 +120,8 @@ void pcibios_add_pci_devices(struct pci_bus * bus) if (mode == PCI_PROBE_DEVTREE) { /* use ofdt-based probe */ of_rescan_bus(dn, bus); - } else if (mode == PCI_PROBE_NORMAL) { + } else if (mode == PCI_PROBE_NORMAL && + dn->child && PCI_DN(dn->child)) { /* * Use legacy probe. In the partial hotplug case, we * probably have grandchildren devices unplugged. So @@ -114,4 +143,4 @@ void pcibios_add_pci_devices(struct pci_bus * bus) } pcibios_finish_adding_to_bus(bus); } -EXPORT_SYMBOL_GPL(pcibios_add_pci_devices); +EXPORT_SYMBOL_GPL(pci_hp_add_devices); diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 60bb187cb46a..3759df52bd67 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -38,7 +38,7 @@ * ISA drivers use hard coded offsets. If no ISA bus exists nothing * is mapped on the first 64K of IO space */ -unsigned long pci_io_base = ISA_IO_BASE; +unsigned long pci_io_base; EXPORT_SYMBOL(pci_io_base); static int __init pcibios_init(void) @@ -47,6 +47,7 @@ static int __init pcibios_init(void) printk(KERN_INFO "PCI: Probing PCI hardware\n"); + pci_io_base = ISA_IO_BASE; /* For now, override phys_mem_access_prot. If we need it,g * later, we may move that initialization to each ppc_md */ @@ -159,7 +160,7 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) /* Establish the mapping */ if (__ioremap_at(phys_page, area->addr, size_page, - _PAGE_NO_CACHE | _PAGE_GUARDED) == NULL) + pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL) return -ENOMEM; /* Fixup hose IO resource */ diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 38102cb9baa9..ecdccce78719 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -282,13 +282,9 @@ void remove_dev_pci_data(struct pci_dev *pdev) #endif /* CONFIG_PCI_IOV */ } -/* - * Traverse_func that inits the PCI fields of the device node. - * NOTE: this *must* be done before read/write config to the device. - */ -void *update_dn_pci_info(struct device_node *dn, void *data) +struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, + struct device_node *dn) { - struct pci_controller *phb = data; const __be32 *type = of_get_property(dn, "ibm,pci-config-space-type", NULL); const __be32 *regs; struct device_node *parent; @@ -299,7 +295,7 @@ void *update_dn_pci_info(struct device_node *dn, void *data) return NULL; dn->data = pdn; pdn->node = dn; - pdn->phb = phb; + pdn->phb = hose; #ifdef CONFIG_PPC_POWERNV pdn->pe_number = IODA_INVALID_PE; #endif @@ -331,8 +327,32 @@ void *update_dn_pci_info(struct device_node *dn, void *data) if (pdn->parent) list_add_tail(&pdn->list, &pdn->parent->child_list); - return NULL; + return pdn; } +EXPORT_SYMBOL_GPL(pci_add_device_node_info); + +void pci_remove_device_node_info(struct device_node *dn) +{ + struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL; +#ifdef CONFIG_EEH + struct eeh_dev *edev = pdn_to_eeh_dev(pdn); + + if (edev) + edev->pdn = NULL; +#endif + + if (!pdn) + return; + + WARN_ON(!list_empty(&pdn->child_list)); + list_del(&pdn->list); + if (pdn->parent) + of_node_put(pdn->parent->node); + + dn->data = NULL; + kfree(pdn); +} +EXPORT_SYMBOL_GPL(pci_remove_device_node_info); /* * Traverse a device tree stopping each PCI device in the tree. @@ -352,8 +372,9 @@ void *update_dn_pci_info(struct device_node *dn, void *data) * one of these nodes we also assume its siblings are non-pci for * performance. */ -void *traverse_pci_devices(struct device_node *start, traverse_func pre, - void *data) +void *pci_traverse_device_nodes(struct device_node *start, + void *(*fn)(struct device_node *, void *), + void *data) { struct device_node *dn, *nextdn; void *ret; @@ -368,8 +389,11 @@ void *traverse_pci_devices(struct device_node *start, traverse_func pre, if (classp) class = of_read_number(classp, 1); - if (pre && ((ret = pre(dn, data)) != NULL)) - return ret; + if (fn) { + ret = fn(dn, data); + if (ret) + return ret; + } /* If we are a PCI bridge, go down */ if (dn->child && ((class >> 8) == PCI_CLASS_BRIDGE_PCI || @@ -391,6 +415,7 @@ void *traverse_pci_devices(struct device_node *start, traverse_func pre, } return NULL; } +EXPORT_SYMBOL_GPL(pci_traverse_device_nodes); static struct pci_dn *pci_dn_next_one(struct pci_dn *root, struct pci_dn *pdn) @@ -432,6 +457,18 @@ void *traverse_pci_dn(struct pci_dn *root, return NULL; } +static void *add_pdn(struct device_node *dn, void *data) +{ + struct pci_controller *hose = data; + struct pci_dn *pdn; + + pdn = pci_add_device_node_info(hose, dn); + if (!pdn) + return ERR_PTR(-ENOMEM); + + return NULL; +} + /** * pci_devs_phb_init_dynamic - setup pci devices under this PHB * phb: pci-to-host bridge (top-level bridge connecting to cpu) @@ -446,8 +483,7 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb) struct pci_dn *pdn; /* PHB nodes themselves must not match */ - update_dn_pci_info(dn, phb); - pdn = dn->data; + pdn = pci_add_device_node_info(phb, dn); if (pdn) { pdn->devfn = pdn->busno = -1; pdn->vendor_id = pdn->device_id = pdn->class_code = 0; @@ -456,7 +492,7 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb) } /* Update dn->phb ptrs for new phb and children devices */ - traverse_pci_devices(dn, update_dn_pci_info, phb); + pci_traverse_device_nodes(dn, add_pdn, phb); } /** diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 2a9280b945e0..ea8a28fd6f31 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -55,6 +56,7 @@ #include #endif #include +#include #include #include @@ -1077,7 +1079,7 @@ struct task_struct *__switch_to(struct task_struct *prev, } #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_STD_MMU_64 batch = this_cpu_ptr(&ppc64_tlb_batch); if (batch->active) { current_thread_info()->local_flags |= _TLF_LAZY_MMU; @@ -1085,7 +1087,7 @@ struct task_struct *__switch_to(struct task_struct *prev, __flush_tlb_pending(batch); batch->active = 0; } -#endif /* CONFIG_PPC_BOOK3S_64 */ +#endif /* CONFIG_PPC_STD_MMU_64 */ #ifdef CONFIG_PPC_ADV_DEBUG_REGS switch_booke_debug_regs(&new->thread.debug); @@ -1131,7 +1133,7 @@ struct task_struct *__switch_to(struct task_struct *prev, last = _switch(old_thread, new_thread); -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_STD_MMU_64 if (current_thread_info()->local_flags & _TLF_LAZY_MMU) { current_thread_info()->local_flags &= ~_TLF_LAZY_MMU; batch = this_cpu_ptr(&ppc64_tlb_batch); @@ -1140,8 +1142,7 @@ struct task_struct *__switch_to(struct task_struct *prev, if (current_thread_info()->task->thread.regs) restore_math(current_thread_info()->task->thread.regs); - -#endif /* CONFIG_PPC_BOOK3S_64 */ +#endif /* CONFIG_PPC_STD_MMU_64 */ return last; } @@ -1376,6 +1377,9 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) unsigned long sp_vsid; unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; + if (radix_enabled()) + return; + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) sp_vsid = get_kernel_vsid(sp, MMU_SEGSIZE_1T) << SLB_VSID_SHIFT_1T; @@ -1924,7 +1928,8 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) * the heap, we can put it above 1TB so it is backed by a 1TB * segment. Otherwise the heap will be in the bottom 1TB * which always uses 256MB segments and this may result in a - * performance penalty. + * performance penalty. We don't need to worry about radix. For + * radix, mmu_highuser_ssize remains unchanged from 256MB. */ if (!is_32bit_task() && (mmu_highuser_ssize == MMU_SEGSIZE_1T)) base = max_t(unsigned long, mm->brk, 1UL << SID_SHIFT_1T); diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index a15fe1d4e84a..946e34ffeae9 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -167,6 +168,7 @@ static struct ibm_pa_feature { */ {CPU_FTR_TM_COMP, 0, 0, PPC_FEATURE2_HTM_COMP|PPC_FEATURE2_HTM_NOSC_COMP, 22, 0, 0}, + {0, MMU_FTR_RADIX, 0, 0, 40, 0, 0}, }; static void __init scan_features(unsigned long node, const unsigned char *ftrs, diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index aa610ce8742f..c638e2487a9c 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -442,7 +442,7 @@ static void do_event_scan(void) } static void rtas_event_scan(struct work_struct *w); -DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan); +static DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan); /* * Delay should be at least one second since some machines have problems if diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 44c8d03558ac..8ca79b7503d8 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -128,9 +128,7 @@ void machine_restart(char *cmd) machine_shutdown(); if (ppc_md.restart) ppc_md.restart(cmd); -#ifdef CONFIG_SMP smp_send_stop(); -#endif printk(KERN_EMERG "System Halted, OK to turn off power\n"); local_irq_disable(); while (1) ; @@ -141,9 +139,7 @@ void machine_power_off(void) machine_shutdown(); if (pm_power_off) pm_power_off(); -#ifdef CONFIG_SMP smp_send_stop(); -#endif printk(KERN_EMERG "System Halted, OK to turn off power\n"); local_irq_disable(); while (1) ; @@ -159,9 +155,7 @@ void machine_halt(void) machine_shutdown(); if (ppc_md.halt) ppc_md.halt(); -#ifdef CONFIG_SMP smp_send_stop(); -#endif printk(KERN_EMERG "System Halted, OK to turn off power\n"); local_irq_disable(); while (1) ; diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c index 6669b1752512..6ae9bd5086a4 100644 --- a/arch/powerpc/kernel/swsusp.c +++ b/arch/powerpc/kernel/swsusp.c @@ -31,6 +31,6 @@ void save_processor_state(void) void restore_processor_state(void) { #ifdef CONFIG_PPC32 - switch_mmu_context(current->active_mm, current->active_mm); + switch_mmu_context(current->active_mm, current->active_mm, NULL); #endif } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 81b0900a39ee..3ed9a5a21d77 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 5f8dcdaa2820..8d7358f3a273 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -87,7 +87,7 @@ struct vio_cmo_dev_entry { * @curr: bytes currently allocated * @high: high water mark for IO data usage */ -struct vio_cmo { +static struct vio_cmo { spinlock_t lock; struct delayed_work balance_q; struct list_head device_list; @@ -615,7 +615,7 @@ static u64 vio_dma_get_required_mask(struct device *dev) return dma_iommu_ops.get_required_mask(dev); } -struct dma_map_ops vio_dma_mapping_ops = { +static struct dma_map_ops vio_dma_mapping_ops = { .alloc = vio_dma_iommu_alloc_coherent, .free = vio_dma_iommu_free_coherent, .mmap = dma_direct_mmap_coherent, diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index c7b78d8336b2..05f09ae82587 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -447,7 +447,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, struct revmap_entry *rev; struct page *page, *pages[1]; long index, ret, npages; - unsigned long is_io; + bool is_ci; unsigned int writing, write_ok; struct vm_area_struct *vma; unsigned long rcbits; @@ -503,7 +503,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, smp_rmb(); ret = -EFAULT; - is_io = 0; + is_ci = false; pfn = 0; page = NULL; pte_size = PAGE_SIZE; @@ -521,7 +521,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, pfn = vma->vm_pgoff + ((hva - vma->vm_start) >> PAGE_SHIFT); pte_size = psize; - is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); + is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot)))); write_ok = vma->vm_flags & VM_WRITE; } up_read(¤t->mm->mmap_sem); @@ -558,10 +558,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, goto out_put; /* Check WIMG vs. the actual page we're accessing */ - if (!hpte_cache_flags_ok(r, is_io)) { - if (is_io) + if (!hpte_cache_flags_ok(r, is_ci)) { + if (is_ci) goto out_put; - /* * Allow guest to map emulated device memory as * uncacheable, but actually make it cacheable. diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 93243554cae9..e20beae5ca7a 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3272,6 +3272,12 @@ static int kvmppc_core_check_processor_compat_hv(void) if (!cpu_has_feature(CPU_FTR_HVMODE) || !cpu_has_feature(CPU_FTR_ARCH_206)) return -EIO; + /* + * Disable KVM for Power9, untill the required bits merged. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return -EIO; + return 0; } diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 4cb8db05f3e5..99b4e9d5dd23 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -175,7 +175,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, unsigned long g_ptel; struct kvm_memory_slot *memslot; unsigned hpage_shift; - unsigned long is_io; + bool is_ci; unsigned long *rmap; pte_t *ptep; unsigned int writing; @@ -199,7 +199,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, gfn = gpa >> PAGE_SHIFT; memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); pa = 0; - is_io = ~0ul; + is_ci = false; rmap = NULL; if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) { /* Emulated MMIO - mark this with key=31 */ @@ -250,7 +250,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, if (writing && !pte_write(pte)) /* make the actual HPTE be read-only */ ptel = hpte_make_readonly(ptel); - is_io = hpte_cache_bits(pte_val(pte)); + is_ci = pte_ci(pte); pa = pte_pfn(pte) << PAGE_SHIFT; pa |= hva & (host_pte_size - 1); pa |= gpa & ~PAGE_MASK; @@ -267,9 +267,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, else pteh |= HPTE_V_ABSENT; - /* Check WIMG */ - if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) { - if (is_io) + /*If we had host pte mapping then Check WIMG */ + if (ptep && !hpte_cache_flags_ok(ptel, is_ci)) { + if (is_ci) return H_PARAMETER; /* * Allow guest to map emulated device memory as diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 8129b0db131e..8e4f64f0b774 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1713,7 +1713,11 @@ static void kvmppc_core_destroy_vm_pr(struct kvm *kvm) static int kvmppc_core_check_processor_compat_pr(void) { - /* we are always compatible */ + /* + * Disable KVM for Power9 untill the required bits merged. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return -EIO; return 0; } diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S index c44df2dbedd5..99f37f24185c 100644 --- a/arch/powerpc/lib/copy_32.S +++ b/arch/powerpc/lib/copy_32.S @@ -217,7 +217,7 @@ _GLOBAL(memcpy) bdnz 40b 65: blr -_GLOBAL(generic_memcpy) +generic_memcpy: srwi. r7,r5,3 addi r6,r3,-4 addi r4,r4,-4 diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index dc885b30f7a6..3362299b1859 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs, } } #endif + break; /* illegal instruction */ case 31: switch ((instr >> 1) & 0x3ff) { @@ -1818,9 +1819,11 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) case 4: __get_user_asmx(val, op.ea, err, "lwarx"); break; +#ifdef __powerpc64__ case 8: __get_user_asmx(val, op.ea, err, "ldarx"); break; +#endif default: return 0; } @@ -1841,9 +1844,11 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) case 4: __put_user_asmx(op.val, op.ea, err, "stwcx.", cr); break; +#ifdef __powerpc64__ case 8: __put_user_asmx(op.val, op.ea, err, "stdcx.", cr); break; +#endif default: return 0; } diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c index 07f49f1568e5..f9de69a04e88 100644 --- a/arch/powerpc/lib/xor_vmx.c +++ b/arch/powerpc/lib/xor_vmx.c @@ -17,7 +17,17 @@ * * Author: Anton Blanchard */ + +/* + * Sparse (as at v0.5.0) gets very, very confused by this file. + * Make it a bit simpler for it. + */ +#if !defined(__CHECKER__) #include +#else +#define vec_xor(a, b) a ^ b +#define vector __attribute__((vector_size(16))) +#endif #include #include diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index adfee3f1aeb9..f2cea6d5e764 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -13,10 +13,11 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o -obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o slb_low.o slb.o $(hash64-y) -obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o -obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o \ - mmu_context_hash$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o +obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o +obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o +obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o +obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o ifeq ($(CONFIG_PPC_STD_MMU_64),y) obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o @@ -33,6 +34,7 @@ obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-y += hugetlbpage.o ifeq ($(CONFIG_HUGETLB_PAGE),y) obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o +obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index a1b2713f6e96..139dec421e57 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -135,7 +135,7 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, TLBCAM[index].MAS7 = (u64)phys >> 32; /* Below is unlikely -- only for large user pages or similar */ - if (pte_user(flags)) { + if (pte_user(__pte(flags))) { TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); } diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c index 47d1b26effc6..6333b273d2d5 100644 --- a/arch/powerpc/mm/hash64_4k.c +++ b/arch/powerpc/mm/hash64_4k.c @@ -34,21 +34,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, old_pte = pte_val(pte); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access. Since this is 4K insert of 64K page size - * also add _PAGE_COMBO + * also add H_PAGE_COMBO */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; - } while (old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + /* * PP bits. _PAGE_USER is already PP bit 0x2, so we only * need to add in 0x1 if it's a read-only user page @@ -60,22 +60,22 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); vpn = hpt_vpn(ea, vsid, ssize); - if (unlikely(old_pte & _PAGE_HASHPTE)) { + if (unlikely(old_pte & H_PAGE_HASHPTE)) { /* * There MIGHT be an HPTE for this pte */ hash = hpt_hash(vpn, shift, ssize); - if (old_pte & _PAGE_F_SECOND) + if (old_pte & H_PAGE_F_SECOND) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT; + slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_4K, MMU_PAGE_4K, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } - if (likely(!(old_pte & _PAGE_HASHPTE))) { + if (likely(!(old_pte & H_PAGE_HASHPTE))) { pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; hash = hpt_hash(vpn, shift, ssize); @@ -115,9 +115,10 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, MMU_PAGE_4K, MMU_PAGE_4K, old_pte); return -1; } - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; - new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX); + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & + (H_PAGE_F_SECOND | H_PAGE_F_GIX); } - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c index b2d659cf51c6..16644e1f4e6b 100644 --- a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/hash64_64k.c @@ -23,7 +23,7 @@ bool __rpte_sub_valid(real_pte_t rpte, unsigned long index) unsigned long g_idx; unsigned long ptev = pte_val(rpte.pte); - g_idx = (ptev & _PAGE_COMBO_VALID) >> _PAGE_F_GIX_SHIFT; + g_idx = (ptev & H_PAGE_COMBO_VALID) >> H_PAGE_F_GIX_SHIFT; index = index >> 2; if (g_idx & (0x1 << index)) return true; @@ -37,12 +37,12 @@ static unsigned long mark_subptegroup_valid(unsigned long ptev, unsigned long in { unsigned long g_idx; - if (!(ptev & _PAGE_COMBO)) + if (!(ptev & H_PAGE_COMBO)) return ptev; index = index >> 2; g_idx = 0x1 << index; - return ptev | (g_idx << _PAGE_F_GIX_SHIFT); + return ptev | (g_idx << H_PAGE_F_GIX_SHIFT); } int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, @@ -66,21 +66,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, old_pte = pte_val(pte); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access. Since this is 4K insert of 64K page size - * also add _PAGE_COMBO + * also add H_PAGE_COMBO */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED | _PAGE_COMBO; - if (access & _PAGE_RW) + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO; + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; - } while (old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + /* * Handle the subpage protection bits */ @@ -103,21 +103,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, /* *None of the sub 4k page is hashed */ - if (!(old_pte & _PAGE_HASHPTE)) + if (!(old_pte & H_PAGE_HASHPTE)) goto htab_insert_hpte; /* * Check if the pte was already inserted into the hash table * as a 64k HW page, and invalidate the 64k HPTE if so. */ - if (!(old_pte & _PAGE_COMBO)) { + if (!(old_pte & H_PAGE_COMBO)) { flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags); /* * clear the old slot details from the old and new pte. * On hash insert failure we use old pte value and we don't * want slot information there if we have a insert failure. */ - old_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND); - new_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND); + old_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND); + new_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND); goto htab_insert_hpte; } /* @@ -143,15 +143,15 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, if (ret == -1) goto htab_insert_hpte; - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } htab_insert_hpte: /* - * handle _PAGE_4K_PFN case + * handle H_PAGE_4K_PFN case */ - if (old_pte & _PAGE_4K_PFN) { + if (old_pte & H_PAGE_4K_PFN) { /* * All the sub 4k page have the same * physical address. @@ -199,20 +199,20 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, } /* * Insert slot number & secondary bit in PTE second half, - * clear _PAGE_BUSY and set appropriate HPTE slot bit - * Since we have _PAGE_BUSY set on ptep, we can be sure + * clear H_PAGE_BUSY and set appropriate HPTE slot bit + * Since we have H_PAGE_BUSY set on ptep, we can be sure * nobody is undating hidx. */ hidxp = (unsigned long *)(ptep + PTRS_PER_PTE); rpte.hidx &= ~(0xfUL << (subpg_index << 2)); *hidxp = rpte.hidx | (slot << (subpg_index << 2)); new_pte = mark_subptegroup_valid(new_pte, subpg_index); - new_pte |= _PAGE_HASHPTE; + new_pte |= H_PAGE_HASHPTE; /* * check __real_pte for details on matching smp_rmb() */ smp_wmb(); - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } @@ -220,7 +220,6 @@ int __hash_page_64K(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, unsigned long flags, int ssize) { - unsigned long hpte_group; unsigned long rflags, pa; unsigned long old_pte, new_pte; @@ -235,27 +234,26 @@ int __hash_page_64K(unsigned long ea, unsigned long access, old_pte = pte_val(pte); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; /* * Check if PTE has the cache-inhibit bit set * If so, bail out and refault as a 4k page */ if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) && - unlikely(old_pte & _PAGE_NO_CACHE)) + unlikely(pte_ci(pte))) return 0; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access. */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; - } while (old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); rflags = htab_convert_pte_flags(new_pte); @@ -264,22 +262,22 @@ int __hash_page_64K(unsigned long ea, unsigned long access, rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); vpn = hpt_vpn(ea, vsid, ssize); - if (unlikely(old_pte & _PAGE_HASHPTE)) { + if (unlikely(old_pte & H_PAGE_HASHPTE)) { /* * There MIGHT be an HPTE for this pte */ hash = hpt_hash(vpn, shift, ssize); - if (old_pte & _PAGE_F_SECOND) + if (old_pte & H_PAGE_F_SECOND) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT; + slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K, MMU_PAGE_64K, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } - if (likely(!(old_pte & _PAGE_HASHPTE))) { + if (likely(!(old_pte & H_PAGE_HASHPTE))) { pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; hash = hpt_hash(vpn, shift, ssize); @@ -319,9 +317,10 @@ int __hash_page_64K(unsigned long ea, unsigned long access, MMU_PAGE_64K, MMU_PAGE_64K, old_pte); return -1; } - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; - new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX); + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & + (H_PAGE_F_SECOND | H_PAGE_F_GIX); } - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 8eaac81347fd..d873f6507f72 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -221,7 +221,7 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, return -1; hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; + hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags; if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", @@ -719,6 +719,12 @@ static void native_flush_hash_range(unsigned long number, int local) local_irq_restore(flags); } +static int native_update_partition_table(u64 patb1) +{ + partition_tb->patb1 = cpu_to_be64(patb1); + return 0; +} + void __init hpte_init_native(void) { ppc_md.hpte_invalidate = native_hpte_invalidate; @@ -729,4 +735,7 @@ void __init hpte_init_native(void) ppc_md.hpte_clear_all = native_hpte_clear; ppc_md.flush_hash_range = native_flush_hash_range; ppc_md.hugepage_invalidate = native_hugepage_invalidate; + + if (cpu_has_feature(CPU_FTR_ARCH_300)) + ppc_md.update_partition_table = native_update_partition_table; } diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 7635b1c6b5da..59268969a0bc 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -167,16 +167,22 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags) if ((pteflags & _PAGE_EXEC) == 0) rflags |= HPTE_R_N; /* - * PP bits: + * PPP bits: * Linux uses slb key 0 for kernel and 1 for user. - * kernel areas are mapped with PP=00 - * and there is no kernel RO (_PAGE_KERNEL_RO). - * User area is mapped with PP=0x2 for read/write - * or PP=0x3 for read-only (including writeable but clean pages). + * kernel RW areas are mapped with PPP=0b000 + * User area is mapped with PPP=0b010 for read/write + * or PPP=0b011 for read-only (including writeable but clean pages). */ - if (pteflags & _PAGE_USER) { - rflags |= 0x2; - if (!((pteflags & _PAGE_RW) && (pteflags & _PAGE_DIRTY))) + if (pteflags & _PAGE_PRIVILEGED) { + /* + * Kernel read only mapped with ppp bits 0b110 + */ + if (!(pteflags & _PAGE_WRITE)) + rflags |= (HPTE_R_PP0 | 0x2); + } else { + if (pteflags & _PAGE_RWX) + rflags |= 0x2; + if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) rflags |= 0x1; } /* @@ -186,12 +192,13 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags) /* * Add in WIG bits */ - if (pteflags & _PAGE_WRITETHRU) - rflags |= HPTE_R_W; - if (pteflags & _PAGE_NO_CACHE) + + if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) rflags |= HPTE_R_I; - if (pteflags & _PAGE_GUARDED) - rflags |= HPTE_R_G; + if ((pteflags & _PAGE_CACHE_CTL ) == _PAGE_NON_IDEMPOTENT) + rflags |= (HPTE_R_I | HPTE_R_G); + if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO) + rflags |= (HPTE_R_I | HPTE_R_W); return rflags; } @@ -669,6 +676,41 @@ int remove_section_mapping(unsigned long start, unsigned long end) } #endif /* CONFIG_MEMORY_HOTPLUG */ +static void __init hash_init_partition_table(phys_addr_t hash_table, + unsigned long pteg_count) +{ + unsigned long ps_field; + unsigned long htab_size; + unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; + + /* + * slb llp encoding for the page size used in VPM real mode. + * We can ignore that for lpid 0 + */ + ps_field = 0; + htab_size = __ilog2(pteg_count) - 11; + + BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large."); + partition_tb = __va(memblock_alloc_base(patb_size, patb_size, + MEMBLOCK_ALLOC_ANYWHERE)); + + /* Initialize the Partition Table with no entries */ + memset((void *)partition_tb, 0, patb_size); + partition_tb->patb0 = cpu_to_be64(ps_field | hash_table | htab_size); + /* + * FIXME!! This should be done via update_partition table + * For now UPRT is 0 for us. + */ + partition_tb->patb1 = 0; + DBG("Partition table %p\n", partition_tb); + /* + * update partition table control register, + * 64 K size. + */ + mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + +} + static void __init htab_initialize(void) { unsigned long table; @@ -737,8 +779,11 @@ static void __init htab_initialize(void) /* Initialize the HPT with no entries */ memset((void *)table, 0, htab_size_bytes); - /* Set SDR1 */ - mtspr(SPRN_SDR1, _SDR1); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + /* Set SDR1 */ + mtspr(SPRN_SDR1, _SDR1); + else + hash_init_partition_table(table, pteg_count); } prot = pgprot_val(PAGE_KERNEL); @@ -823,8 +868,38 @@ static void __init htab_initialize(void) #undef KB #undef MB -void __init early_init_mmu(void) +void __init hash__early_init_mmu(void) { + /* + * initialize page table size + */ + __pte_frag_nr = H_PTE_FRAG_NR; + __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; + + __pte_index_size = H_PTE_INDEX_SIZE; + __pmd_index_size = H_PMD_INDEX_SIZE; + __pud_index_size = H_PUD_INDEX_SIZE; + __pgd_index_size = H_PGD_INDEX_SIZE; + __pmd_cache_index = H_PMD_CACHE_INDEX; + __pte_table_size = H_PTE_TABLE_SIZE; + __pmd_table_size = H_PMD_TABLE_SIZE; + __pud_table_size = H_PUD_TABLE_SIZE; + __pgd_table_size = H_PGD_TABLE_SIZE; + /* + * 4k use hugepd format, so for hash set then to + * zero + */ + __pmd_val_bits = 0; + __pud_val_bits = 0; + __pgd_val_bits = 0; + + __kernel_virt_start = H_KERN_VIRT_START; + __kernel_virt_size = H_KERN_VIRT_SIZE; + __vmalloc_start = H_VMALLOC_START; + __vmalloc_end = H_VMALLOC_END; + vmemmap = (struct page *)H_VMEMMAP_BASE; + ioremap_bot = IOREMAP_BASE; + /* Initialize the MMU Hash table and create the linear mapping * of memory. Has to be done before SLB initialization as this is * currently where the page size encoding is obtained. @@ -836,12 +911,16 @@ void __init early_init_mmu(void) } #ifdef CONFIG_SMP -void early_init_mmu_secondary(void) +void hash__early_init_mmu_secondary(void) { /* Initialize hash table for that CPU */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) - mtspr(SPRN_SDR1, _SDR1); - + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + mtspr(SPRN_SDR1, _SDR1); + else + mtspr(SPRN_PTCR, + __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + } /* Initialize SLB */ slb_initialize(); } @@ -920,7 +999,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) * Userspace sets the subpage permissions using the subpage_prot system call. * * Result is 0: full permissions, _PAGE_RW: read-only, - * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access. + * _PAGE_RWX: no access. */ static int subpage_protection(struct mm_struct *mm, unsigned long ea) { @@ -946,8 +1025,13 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea) /* extract 2-bit bitfield for this 4k subpage */ spp >>= 30 - 2 * ((ea >> 12) & 0xf); - /* turn 0,1,2,3 into combination of _PAGE_USER and _PAGE_RW */ - spp = ((spp & 2) ? _PAGE_USER : 0) | ((spp & 1) ? _PAGE_RW : 0); + /* + * 0 -> full premission + * 1 -> Read only + * 2 -> no access. + * We return the flag that need to be cleared. + */ + spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0); return spp; } @@ -1084,7 +1168,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, /* Pre-check access permissions (will be re-checked atomically * in __hash_page_XX but this pre-check is a fast path */ - if (access & ~pte_val(*ptep)) { + if (!check_pte_access(access, pte_val(*ptep))) { DBG_LOW(" no access !\n"); rc = 1; goto bail; @@ -1122,8 +1206,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, #endif /* Do actual hashing */ #ifdef CONFIG_PPC_64K_PAGES - /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */ - if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) { + /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */ + if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) { demote_segment_4k(mm, ea); psize = MMU_PAGE_4K; } @@ -1131,8 +1215,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, /* If this PTE is non-cacheable and we have restrictions on * using non cacheable large pages, then we switch to 4k */ - if (mmu_ci_restrictions && psize == MMU_PAGE_64K && - (pte_val(*ptep) & _PAGE_NO_CACHE)) { + if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) { if (user_region) { demote_segment_4k(mm, ea); psize = MMU_PAGE_4K; @@ -1209,7 +1292,7 @@ EXPORT_SYMBOL_GPL(hash_page); int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, unsigned long dsisr) { - unsigned long access = _PAGE_PRESENT; + unsigned long access = _PAGE_PRESENT | _PAGE_READ; unsigned long flags = 0; struct mm_struct *mm = current->mm; @@ -1220,14 +1303,18 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, flags |= HPTE_NOHPTE_UPDATE; if (dsisr & DSISR_ISSTORE) - access |= _PAGE_RW; + access |= _PAGE_WRITE; /* - * We need to set the _PAGE_USER bit if MSR_PR is set or if we are - * accessing a userspace segment (even from the kernel). We assume - * kernel addresses always have the high bit set. + * We set _PAGE_PRIVILEGED only when + * kernel mode access kernel space. + * + * _PAGE_PRIVILEGED is NOT set + * 1) when kernel mode access user space + * 2) user space access kernel space. */ + access |= _PAGE_PRIVILEGED; if ((msr & MSR_PR) || (REGION_ID(ea) == USER_REGION_ID)) - access |= _PAGE_USER; + access &= ~_PAGE_PRIVILEGED; if (trap == 0x400) access |= _PAGE_EXEC; @@ -1235,6 +1322,30 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, return hash_page_mm(mm, ea, access, trap, flags); } +#ifdef CONFIG_PPC_MM_SLICES +static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) +{ + int psize = get_slice_psize(mm, ea); + + /* We only prefault standard pages for now */ + if (unlikely(psize != mm->context.user_psize)) + return false; + + /* + * Don't prefault if subpage protection is enabled for the EA. + */ + if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea))) + return false; + + return true; +} +#else +static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) +{ + return true; +} +#endif + void hash_preload(struct mm_struct *mm, unsigned long ea, unsigned long access, unsigned long trap) { @@ -1247,11 +1358,8 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, BUG_ON(REGION_ID(ea) != USER_REGION_ID); -#ifdef CONFIG_PPC_MM_SLICES - /* We only prefault standard pages for now */ - if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize)) + if (!should_hash_preload(mm, ea)) return; -#endif DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," " trap=%lx\n", mm, mm->pgd, ea, access, trap); @@ -1282,13 +1390,13 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, WARN_ON(hugepage_shift); #ifdef CONFIG_PPC_64K_PAGES - /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on + /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on * a 64K kernel), then we don't preload, hash_page() will take * care of it once we actually try to access the page. * That way we don't have to duplicate all of the logic for segment * page size demotion here */ - if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE)) + if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) goto out_exit; #endif /* CONFIG_PPC_64K_PAGES */ @@ -1570,7 +1678,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) } #endif /* CONFIG_DEBUG_PAGEALLOC */ -void setup_initial_memory_limit(phys_addr_t first_memblock_base, +void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { /* We don't currently support the first MEMBLOCK not mapping 0 diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index eb2accdd76fd..ba3fc229468a 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -37,20 +37,20 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, old_pmd = pmd_val(pmd); /* If PMD busy, retry the access */ - if (unlikely(old_pmd & _PAGE_BUSY)) + if (unlikely(old_pmd & H_PAGE_BUSY)) return 0; /* If PMD permissions don't match, take page fault */ - if (unlikely(access & ~old_pmd)) + if (unlikely(!check_pte_access(access, old_pmd))) return 1; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access */ - new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) new_pmd |= _PAGE_DIRTY; - } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp, - old_pmd, new_pmd)); + } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); + rflags = htab_convert_pte_flags(new_pmd); #if 0 @@ -78,7 +78,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, * base page size. This is because demote_segment won't flush * hash page table entries. */ - if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO)) { + if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) { flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K, ssize, flags); /* @@ -125,7 +125,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, hash = hpt_hash(vpn, shift, ssize); /* insert new entry */ pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; - new_pmd |= _PAGE_HASHPTE; + new_pmd |= H_PAGE_HASHPTE; repeat: hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; @@ -169,17 +169,17 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, mark_hpte_slot_valid(hpte_slot_array, index, slot); } /* - * Mark the pte with _PAGE_COMBO, if we are trying to hash it with + * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with * base page size 4k. */ if (psize == MMU_PAGE_4K) - new_pmd |= _PAGE_COMBO; + new_pmd |= H_PAGE_COMBO; /* * The hpte valid is stored in the pgtable whose address is in the * second half of the PMD. Order this against clearing of the busy bit in * huge pmd. */ smp_wmb(); - *pmdp = __pmd(new_pmd & ~_PAGE_BUSY); + *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index 8555fce902fe..3058560b6121 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -47,18 +47,19 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, do { old_pte = pte_val(*ptep); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; + /* Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; - } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + rflags = htab_convert_pte_flags(new_pte); sz = ((1UL) << shift); @@ -68,28 +69,28 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); /* Check if pte already has an hpte (case 2) */ - if (unlikely(old_pte & _PAGE_HASHPTE)) { + if (unlikely(old_pte & H_PAGE_HASHPTE)) { /* There MIGHT be an HPTE for this pte */ unsigned long hash, slot; hash = hpt_hash(vpn, shift, ssize); - if (old_pte & _PAGE_F_SECOND) + if (old_pte & H_PAGE_F_SECOND) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT; + slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize, mmu_psize, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } - if (likely(!(old_pte & _PAGE_HASHPTE))) { + if (likely(!(old_pte & H_PAGE_HASHPTE))) { unsigned long hash = hpt_hash(vpn, shift, ssize); pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; /* clear HPTE slot informations in new PTE */ - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, mmu_psize, ssize); @@ -105,14 +106,14 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, return -1; } - new_pte |= (slot << _PAGE_F_GIX_SHIFT) & - (_PAGE_F_SECOND | _PAGE_F_GIX); + new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & + (H_PAGE_F_SECOND | H_PAGE_F_GIX); } /* * No need to use ldarx/stdcx here */ - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c new file mode 100644 index 000000000000..1e11559e1aac --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage-radix.c @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include +#include + +void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + unsigned long ap, shift; + struct hstate *hstate = hstate_file(vma->vm_file); + + shift = huge_page_shift(hstate); + if (shift == mmu_psize_defs[MMU_PAGE_2M].shift) + ap = mmu_get_ap(MMU_PAGE_2M); + else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift) + ap = mmu_get_ap(MMU_PAGE_1G); + else { + WARN(1, "Wrong huge page shift\n"); + return ; + } + radix___flush_tlb_page(vma->vm_mm, vmaddr, ap, 0); +} + +void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + unsigned long ap, shift; + struct hstate *hstate = hstate_file(vma->vm_file); + + shift = huge_page_shift(hstate); + if (shift == mmu_psize_defs[MMU_PAGE_2M].shift) + ap = mmu_get_ap(MMU_PAGE_2M); + else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift) + ap = mmu_get_ap(MMU_PAGE_1G); + else { + WARN(1, "Wrong huge page shift\n"); + return ; + } + radix___local_flush_tlb_page(vma->vm_mm, vmaddr, ap, 0); +} + +/* + * A vairant of hugetlb_get_unmapped_area doing topdown search + * FIXME!! should we do as x86 does or non hugetlb area does ? + * ie, use topdown or not based on mmap_is_legacy check ? + */ +unsigned long +radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) { + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + return addr; + } + + if (addr) { + addr = ALIGN(addr, huge_page_size(h)); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + /* + * We are always doing an topdown search here. Slice code + * does that too. + */ + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = current->mm->mmap_base; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); +} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a4a90a869999..5aac1a3f86cd 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -711,6 +711,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct hstate *hstate = hstate_file(file); int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); + if (radix_enabled()) + return radix__hugetlb_get_unmapped_area(file, addr, len, + pgoff, flags); return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); } #endif @@ -719,14 +722,14 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { #ifdef CONFIG_PPC_MM_SLICES unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); - - return 1UL << mmu_psize_to_shift(psize); -#else + /* With radix we don't use slice, so derive it from vma*/ + if (!radix_enabled()) + return 1UL << mmu_psize_to_shift(psize); +#endif if (!is_vm_hugetlb_page(vma)) return PAGE_SIZE; return huge_page_size(hstate_vma(vma)); -#endif } static inline bool is_power_of_4(unsigned long x) @@ -825,7 +828,7 @@ static int __init hugetlbpage_init(void) { int psize; - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE)) return -ENODEV; for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { @@ -865,6 +868,9 @@ static int __init hugetlbpage_init(void) HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; else if (mmu_psize_defs[MMU_PAGE_1M].shift) HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; + else if (mmu_psize_defs[MMU_PAGE_2M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift; + return 0; } @@ -1005,9 +1011,9 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, end = pte_end; pte = READ_ONCE(*ptep); - mask = _PAGE_PRESENT | _PAGE_USER; + mask = _PAGE_PRESENT | _PAGE_READ; if (write) - mask |= _PAGE_RW; + mask |= _PAGE_WRITE; if ((pte_val(pte) & mask) != mask) return 0; diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index ba655666186d..33709bdb0419 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -66,11 +66,11 @@ #include "mmu_decl.h" #ifdef CONFIG_PPC_STD_MMU_64 -#if PGTABLE_RANGE > USER_VSID_RANGE +#if H_PGTABLE_RANGE > USER_VSID_RANGE #warning Limited user VSID range means pagetable space is wasted #endif -#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) +#if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) #warning TASK_SIZE is smaller than it needs to be. #endif #endif /* CONFIG_PPC_STD_MMU_64 */ @@ -189,75 +189,6 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size) return 0; } -/* On hash-based CPUs, the vmemmap is bolted in the hash table. - * - * On Book3E CPUs, the vmemmap is currently mapped in the top half of - * the vmalloc space using normal page tables, though the size of - * pages encoded in the PTEs can be different - */ - -#ifdef CONFIG_PPC_BOOK3E -static int __meminit vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - /* Create a PTE encoding without page size */ - unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | - _PAGE_KERNEL_RW; - - /* PTEs only contain page size encodings up to 32M */ - BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); - - /* Encode the size in the PTE */ - flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; - - /* For each PTE for that area, map things. Note that we don't - * increment phys because all PTEs are of the large size and - * thus must have the low bits clear - */ - for (i = 0; i < page_size; i += PAGE_SIZE) - BUG_ON(map_kernel_page(start + i, phys, flags)); - - return 0; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static void vmemmap_remove_mapping(unsigned long start, - unsigned long page_size) -{ -} -#endif -#else /* CONFIG_PPC_BOOK3E */ -static int __meminit vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - int rc = htab_bolt_mapping(start, start + page_size, phys, - pgprot_val(PAGE_KERNEL), - mmu_vmemmap_psize, mmu_kernel_ssize); - if (rc < 0) { - int rc2 = htab_remove_mapping(start, start + page_size, - mmu_vmemmap_psize, - mmu_kernel_ssize); - BUG_ON(rc2 && (rc2 != -ENOENT)); - } - return rc; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static void vmemmap_remove_mapping(unsigned long start, - unsigned long page_size) -{ - int rc = htab_remove_mapping(start, start + page_size, - mmu_vmemmap_psize, - mmu_kernel_ssize); - BUG_ON((rc < 0) && (rc != -ENOENT)); - WARN_ON(rc == -ENOENT); -} -#endif - -#endif /* CONFIG_PPC_BOOK3E */ - struct vmemmap_backing *vmemmap_list; static struct vmemmap_backing *next; static int num_left; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index ac79dbde1015..2fd57fa48429 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -68,12 +68,15 @@ pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); pgprot_t kmap_prot; EXPORT_SYMBOL(kmap_prot); +#define TOP_ZONE ZONE_HIGHMEM static inline pte_t *virt_to_kpte(unsigned long vaddr) { return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), vaddr), vaddr); } +#else +#define TOP_ZONE ZONE_NORMAL #endif int page_is_ram(unsigned long pfn) @@ -267,14 +270,9 @@ void __init limit_zone_pfn(enum zone_type zone, unsigned long pfn_limit) */ int dma_pfn_limit_to_zone(u64 pfn_limit) { - enum zone_type top_zone = ZONE_NORMAL; int i; -#ifdef CONFIG_HIGHMEM - top_zone = ZONE_HIGHMEM; -#endif - - for (i = top_zone; i >= 0; i--) { + for (i = TOP_ZONE; i >= 0; i--) { if (max_zone_pfns[i] <= pfn_limit) return i; } @@ -289,7 +287,6 @@ void __init paging_init(void) { unsigned long long total_ram = memblock_phys_mem_size(); phys_addr_t top_of_ram = memblock_end_of_DRAM(); - enum zone_type top_zone; #ifdef CONFIG_PPC32 unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1); @@ -313,13 +310,9 @@ void __init paging_init(void) (long int)((top_of_ram - total_ram) >> 20)); #ifdef CONFIG_HIGHMEM - top_zone = ZONE_HIGHMEM; limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT); -#else - top_zone = ZONE_NORMAL; #endif - - limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT); + limit_zone_pfn(TOP_ZONE, top_of_ram >> PAGE_SHIFT); zone_limits_final = true; free_area_init_nodes(max_zone_pfns); @@ -498,7 +491,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, * We don't need to worry about _PAGE_PRESENT here because we are * called with either mm->page_table_lock held or ptl lock held */ - unsigned long access = 0, trap; + unsigned long access, trap; + + if (radix_enabled()) + return; /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ if (!pte_young(*ptep) || address >= TASK_SIZE) @@ -511,13 +507,19 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, * * We also avoid filling the hash if not coming from a fault */ - if (current->thread.regs == NULL) - return; - trap = TRAP(current->thread.regs); - if (trap == 0x400) - access |= _PAGE_EXEC; - else if (trap != 0x300) + + trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; + switch (trap) { + case 0x300: + access = 0UL; + break; + case 0x400: + access = _PAGE_EXEC; + break; + default: return; + } + hash_preload(vma->vm_mm, address, access, trap); #endif /* CONFIG_PPC_STD_MMU */ #if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 4087705ba90f..2f1e44362198 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -26,6 +26,9 @@ #include #include #include +#include +#include +#include /* * Top of mmap area (just below the process stack). @@ -78,6 +81,111 @@ static inline unsigned long mmap_base(unsigned long rnd) return PAGE_ALIGN(TASK_SIZE - gap - rnd); } +#ifdef CONFIG_PPC_RADIX_MMU +/* + * Same function as generic code used only for radix, because we don't need to overload + * the generic one. But we will have to duplicate, because hash select + * HAVE_ARCH_UNMAPPED_AREA + */ +static unsigned long +radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct vm_unmapped_area_info info; + + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + info.flags = 0; + info.length = len; + info.low_limit = mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = 0; + return vm_unmapped_area(&info); +} + +static unsigned long +radix__arch_get_unmapped_area_topdown(struct file *filp, + const unsigned long addr0, + const unsigned long len, + const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.high_limit = mm->mmap_base; + info.align_mask = 0; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + } + + return addr; +} + +static void radix__arch_pick_mmap_layout(struct mm_struct *mm, + unsigned long random_factor) +{ + if (mmap_is_legacy()) { + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = radix__arch_get_unmapped_area; + } else { + mm->mmap_base = mmap_base(random_factor); + mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown; + } +} +#else +/* dummy */ +extern void radix__arch_pick_mmap_layout(struct mm_struct *mm, + unsigned long random_factor); +#endif /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: @@ -89,6 +197,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); + if (radix_enabled()) + return radix__arch_pick_mmap_layout(mm, random_factor); /* * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_book3s64.c similarity index 70% rename from arch/powerpc/mm/mmu_context_hash64.c rename to arch/powerpc/mm/mmu_context_book3s64.c index 9ca6fe16cb29..227b2a6c4544 100644 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -58,6 +58,17 @@ int __init_new_context(void) return index; } EXPORT_SYMBOL_GPL(__init_new_context); +static int radix__init_new_context(struct mm_struct *mm, int index) +{ + unsigned long rts_field; + + /* + * set the process table entry, + */ + rts_field = 3ull << PPC_BITLSHIFT(2); + process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); + return 0; +} int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { @@ -67,13 +78,27 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) if (index < 0) return index; - /* The old code would re-promote on fork, we don't do that - * when using slices as it could cause problem promoting slices - * that have been forced down to 4K - */ - if (slice_mm_new_context(mm)) - slice_set_user_psize(mm, mmu_virtual_psize); - subpage_prot_init_new_context(mm); + if (radix_enabled()) { + radix__init_new_context(mm, index); + } else { + + /* The old code would re-promote on fork, we don't do that + * when using slices as it could cause problem promoting slices + * that have been forced down to 4K + * + * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check + * explicitly against context.id == 0. This ensures that we + * properly initialize context slice details for newly allocated + * mm's (which will have id == 0) and don't alter context slice + * inherited via fork (which will have id != 0). + * + * We should not be calling init_new_context() on init_mm. Hence a + * check against 0 is ok. + */ + if (mm->context.id == 0) + slice_set_user_psize(mm, mmu_virtual_psize); + subpage_prot_init_new_context(mm); + } mm->context.id = index; #ifdef CONFIG_PPC_ICSWX mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); @@ -144,8 +169,19 @@ void destroy_context(struct mm_struct *mm) mm->context.cop_lockp = NULL; #endif /* CONFIG_PPC_ICSWX */ + if (radix_enabled()) + process_tb[mm->context.id].prtb1 = 0; + else + subpage_prot_free(mm); destroy_pagetable_page(mm); __destroy_context(mm->context.id); - subpage_prot_free(mm); mm->context.id = MMU_NO_CONTEXT; } + +#ifdef CONFIG_PPC_RADIX_MMU +void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +{ + mtspr(SPRN_PID, next->context.id); + asm volatile("isync": : :"memory"); +} +#endif diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 986afbc22c76..7d95bc402dba 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -226,7 +226,8 @@ static void context_check_map(void) static void context_check_map(void) { } #endif -void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) { unsigned int i, id, cpu = smp_processor_id(); unsigned long *map; @@ -334,8 +335,7 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm) mm->context.active = 0; #ifdef CONFIG_PPC_MM_SLICES - if (slice_mm_new_context(mm)) - slice_set_user_psize(mm, mmu_virtual_psize); + slice_set_user_psize(mm, mmu_virtual_psize); #endif return 0; diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index bfb7c0bcabd5..6af65327c993 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -108,11 +108,6 @@ extern unsigned long Hash_size, Hash_mask; #endif /* CONFIG_PPC32 */ -#ifdef CONFIG_PPC64 -extern int map_kernel_page(unsigned long ea, unsigned long pa, - unsigned long flags); -#endif /* CONFIG_PPC64 */ - extern unsigned long ioremap_bot; extern unsigned long __max_low_memory; extern phys_addr_t __initial_memory_limit_addr; diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c new file mode 100644 index 000000000000..a2298930f990 --- /dev/null +++ b/arch/powerpc/mm/pgtable-book3e.c @@ -0,0 +1,122 @@ +/* + * Copyright 2005, Paul Mackerras, IBM Corporation. + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include "mmu_decl.h" + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * On Book3E CPUs, the vmemmap is currently mapped in the top half of + * the vmalloc space using normal page tables, though the size of + * pages encoded in the PTEs can be different + */ +int __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding without page size */ + unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | + _PAGE_KERNEL_RW; + + /* PTEs only contain page size encodings up to 32M */ + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); + + /* Encode the size in the PTE */ + flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; + + /* For each PTE for that area, map things. Note that we don't + * increment phys because all PTEs are of the large size and + * thus must have the low bits clear + */ + for (i = 0; i < page_size; i += PAGE_SIZE) + BUG_ON(map_kernel_page(start + i, phys, flags)); + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ +} +#endif +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +static __ref void *early_alloc_pgtable(unsigned long size) +{ + void *pt; + + pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS))); + memset(pt, 0, size); + + return pt; +} + +/* + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } else { + pgdp = pgd_offset_k(ea); +#ifndef __PAGETABLE_PUD_FOLDED + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + BUG_ON(pudp == NULL); + pgd_populate(&init_mm, pgdp, pudp); + } +#endif /* !__PAGETABLE_PUD_FOLDED */ + pudp = pud_offset(pgdp, ea); + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + BUG_ON(pmdp == NULL); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + BUG_ON(ptep == NULL); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } + + smp_wmb(); + return 0; +} diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c new file mode 100644 index 000000000000..eb4451144746 --- /dev/null +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -0,0 +1,118 @@ +/* + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include "mmu_decl.h" +#include + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * This is called when relaxing access to a hugepage. It's also called in the page + * fault path when we don't hit any of the major fault cases, ie, a minor + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have + * handled those two for us, we additionally deal with missing execute + * permission here on some processors + */ +int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp, pmd_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&vma->vm_mm->page_table_lock); +#endif + changed = !pmd_same(*(pmdp), entry); + if (changed) { + __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); + /* + * Since we are not supporting SW TLB systems, we don't + * have any thing similar to flush_tlb_page_nohash() + */ + } + return changed; +} + +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); +} +/* + * set a new huge pmd. We should not be called for updating + * an existing pmd entry. That should go via pmd_hugepage_update. + */ +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ +#ifdef CONFIG_DEBUG_VM + WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); + assert_spin_locked(&mm->page_table_lock); + WARN_ON(!pmd_trans_huge(pmd)); +#endif + trace_hugepage_set_pmd(addr, pmd_val(pmd)); + return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); +} +/* + * We use this to invalidate a pmdp entry before switching from a + * hugepte to regular pmd entry. + */ +void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + /* + * This ensures that generic code that rely on IRQ disabling + * to prevent a parallel THP split work as expected. + */ + kick_all_cpus_sync(); +} + +static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) +{ + return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); +} + +pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) +{ + unsigned long pmdv; + + pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + return pmd_set_protbits(__pmd(pmdv), pgprot); +} + +pmd_t mk_pmd(struct page *page, pgprot_t pgprot) +{ + return pfn_pmd(page_to_pfn(page), pgprot); +} + +pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + unsigned long pmdv; + + pmdv = pmd_val(pmd); + pmdv &= _HPAGE_CHG_MASK; + return pmd_set_protbits(__pmd(pmdv), newprot); +} + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a HUGE PMD entry in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux HUGE PMD entry. + */ +void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + return; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c new file mode 100644 index 000000000000..c23e286a6b8f --- /dev/null +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -0,0 +1,342 @@ +/* + * Copyright 2005, Paul Mackerras, IBM Corporation. + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include "mmu_decl.h" + +#define CREATE_TRACE_POINTS +#include + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * On hash-based CPUs, the vmemmap is bolted in the hash table. + * + */ +int __meminit hash__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + int rc = htab_bolt_mapping(start, start + page_size, phys, + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, mmu_kernel_ssize); + if (rc < 0) { + int rc2 = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(rc2 && (rc2 != -ENOENT)); + } + return rc; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void hash__vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ + int rc = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON((rc < 0) && (rc != -ENOENT)); + WARN_ON(rc == -ENOENT); +} +#endif +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +/* + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } else { + /* + * If the mm subsystem is not fully up, we cannot create a + * linux page table entry for this mapping. Simply bolt an + * entry in the hardware page table. + * + */ + if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, + mmu_io_psize, mmu_kernel_ssize)) { + printk(KERN_ERR "Failed to do bolted mapping IO " + "memory at %016lx !\n", pa); + return -ENOMEM; + } + } + + smp_wmb(); + return 0; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + __be64 old_be, tmp; + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&mm->page_table_lock); +#endif + + __asm__ __volatile__( + "1: ldarx %0,0,%3\n\ + and. %1,%0,%6\n\ + bne- 1b \n\ + andc %1,%0,%4 \n\ + or %1,%1,%7\n\ + stdcx. %1,0,%3 \n\ + bne- 1b" + : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) + : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), + "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) + : "cc" ); + + old = be64_to_cpu(old_be); + + trace_hugepage_update(addr, old, clr, set); + if (old & H_PAGE_HASHPTE) + hpte_do_hugepage_flush(mm, addr, pmdp, old); + return old; +} + +pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(pmd_trans_huge(*pmdp)); + + pmd = *pmdp; + pmd_clear(pmdp); + /* + * Wait for all pending hash_page to finish. This is needed + * in case of subpage collapse. When we collapse normal pages + * to hugepage, we first clear the pmd, then invalidate all + * the PTE entries. The assumption here is that any low level + * page fault will see a none pmd and take the slow path that + * will wait on mmap_sem. But we could very well be in a + * hash_page with local ptep pointer value. Such a hash page + * can result in adding new HPTE entries for normal subpages. + * That means we could be modifying the page content as we + * copy them to a huge page. So wait for parallel hash_page + * to finish before invalidating HPTE entries. We can do this + * by sending an IPI to all the cpus and executing a dummy + * function there. + */ + kick_all_cpus_sync(); + /* + * Now invalidate the hpte entries in the range + * covered by pmd. This make sure we take a + * fault and will find the pmd as none, which will + * result in a major fault which takes mmap_sem and + * hence wait for collapse to complete. Without this + * the __collapse_huge_page_copy can result in copying + * the old content. + */ + flush_tlb_pmd_range(vma->vm_mm, &pmd, address); + return pmd; +} + +/* + * We want to put the pgtable in pmd and use pgtable for tracking + * the base page size hptes + */ +void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + pgtable_t *pgtable_slot; + assert_spin_locked(&mm->page_table_lock); + /* + * we store the pgtable in the second half of PMD + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + *pgtable_slot = pgtable; + /* + * expose the deposited pgtable to other cpus. + * before we set the hugepage PTE at pmd level + * hash fault code looks at the deposted pgtable + * to store hash index values. + */ + smp_wmb(); +} + +pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pgtable_t pgtable; + pgtable_t *pgtable_slot; + + assert_spin_locked(&mm->page_table_lock); + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Once we withdraw, mark the entry NULL. + */ + *pgtable_slot = NULL; + /* + * We store HPTE information in the deposited PTE fragment. + * zero out the content on withdraw. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + return pgtable; +} + +void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); + + /* + * We can't mark the pmd none here, because that will cause a race + * against exit_mmap. We need to continue mark pmd TRANS HUGE, while + * we spilt, but at the same time we wan't rest of the ppc64 code + * not to insert hash pte on this, because we will be modifying + * the deposited pgtable in the caller of this function. Hence + * clear the _PAGE_USER so that we move the fault handling to + * higher level function and that will serialize against ptl. + * We need to flush existing hash pte entries here even though, + * the translation is still valid, because we will withdraw + * pgtable_t after this. + */ + pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED); +} + +/* + * A linux hugepage PMD was changed and the corresponding hash table entries + * neesd to be flushed. + */ +void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long old_pmd) +{ + int ssize; + unsigned int psize; + unsigned long vsid; + unsigned long flags = 0; + const struct cpumask *tmp; + + /* get the base page size,vsid and segment size */ +#ifdef CONFIG_DEBUG_VM + psize = get_slice_psize(mm, addr); + BUG_ON(psize == MMU_PAGE_16M); +#endif + if (old_pmd & H_PAGE_COMBO) + psize = MMU_PAGE_4K; + else + psize = MMU_PAGE_64K; + + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_vsid(mm->context.id, addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + + tmp = cpumask_of(smp_processor_id()); + if (cpumask_equal(mm_cpumask(mm), tmp)) + flags |= HPTE_LOCAL_UPDATE; + + return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); +} + +pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old_pmd; + pgtable_t pgtable; + unsigned long old; + pgtable_t *pgtable_slot; + + old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); + old_pmd = __pmd(old); + /* + * We have pmd == none and we are holding page_table_lock. + * So we can safely go and clear the pgtable hash + * index info. + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Let's zero out old valid and hash index details + * hash fault look at them. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + /* + * Serialize against find_linux_pte_or_hugepte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_linux_pte_or_hugepage to finish. + */ + kick_all_cpus_sync(); + return old_pmd; +} + +int hash__has_transparent_hugepage(void) +{ + + if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + return 0; + /* + * We support THP only if PMD_SIZE is 16MB. + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) + return 0; + /* + * We need to make sure that we support 16MB hugepage in a segement + * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE + * of 64K. + */ + /* + * If we have 64K HPTE, we will be using that by default + */ + if (mmu_psize_defs[MMU_PAGE_64K].shift && + (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) + return 0; + /* + * Ok we only have 4K HPTE + */ + if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) + return 0; + + return 1; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c new file mode 100644 index 000000000000..18b2c11604fa --- /dev/null +++ b/arch/powerpc/mm/pgtable-radix.c @@ -0,0 +1,526 @@ +/* + * Page table handling routines for radix page table. + * + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +static int native_update_partition_table(u64 patb1) +{ + partition_tb->patb1 = cpu_to_be64(patb1); + return 0; +} + +static __ref void *early_alloc_pgtable(unsigned long size) +{ + void *pt; + + pt = __va(memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE)); + memset(pt, 0, size); + + return pt; +} + +int radix__map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + /* + * Make sure task size is correct as per the max adddr + */ + BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + if (map_page_size == PMD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + } else { + pgdp = pgd_offset_k(ea); + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + BUG_ON(pudp == NULL); + pgd_populate(&init_mm, pgdp, pudp); + } + pudp = pud_offset(pgdp, ea); + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + BUG_ON(pmdp == NULL); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (map_page_size == PMD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + BUG_ON(ptep == NULL); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + } + +set_the_pte: + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags)); + smp_wmb(); + return 0; +} + +static void __init radix_init_pgtable(void) +{ + int loop_count; + u64 base, end, start_addr; + unsigned long rts_field; + struct memblock_region *reg; + unsigned long linear_page_size; + + /* We don't support slb for radix */ + mmu_slb_size = 0; + /* + * Create the linear mapping, using standard page size for now + */ + loop_count = 0; + for_each_memblock(memory, reg) { + + start_addr = reg->base; + +redo: + if (loop_count < 1 && mmu_psize_defs[MMU_PAGE_1G].shift) + linear_page_size = PUD_SIZE; + else if (loop_count < 2 && mmu_psize_defs[MMU_PAGE_2M].shift) + linear_page_size = PMD_SIZE; + else + linear_page_size = PAGE_SIZE; + + base = _ALIGN_UP(start_addr, linear_page_size); + end = _ALIGN_DOWN(reg->base + reg->size, linear_page_size); + + pr_info("Mapping range 0x%lx - 0x%lx with 0x%lx\n", + (unsigned long)base, (unsigned long)end, + linear_page_size); + + while (base < end) { + radix__map_kernel_page((unsigned long)__va(base), + base, PAGE_KERNEL_X, + linear_page_size); + base += linear_page_size; + } + /* + * map the rest using lower page size + */ + if (end < reg->base + reg->size) { + start_addr = end; + loop_count++; + goto redo; + } + } + /* + * Allocate Partition table and process table for the + * host. + */ + BUILD_BUG_ON_MSG((PRTB_SIZE_SHIFT > 23), "Process table size too large."); + process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT); + /* + * Fill in the process table. + * we support 52 bits, hence 52-28 = 24, 11000 + */ + rts_field = 3ull << PPC_BITLSHIFT(2); + process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); + /* + * Fill in the partition table. We are suppose to use effective address + * of process table here. But our linear mapping also enable us to use + * physical address here. + */ + ppc_md.update_partition_table(__pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR); + pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); +} + +static void __init radix_init_partition_table(void) +{ + unsigned long rts_field; + /* + * we support 52 bits, hence 52-28 = 24, 11000 + */ + rts_field = 3ull << PPC_BITLSHIFT(2); + + BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large."); + partition_tb = early_alloc_pgtable(1UL << PATB_SIZE_SHIFT); + partition_tb->patb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | + RADIX_PGD_INDEX_SIZE | PATB_HR); + printk("Partition table %p\n", partition_tb); + + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + /* + * update partition table control register, + * 64 K size. + */ + mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); +} + +void __init radix_init_native(void) +{ + ppc_md.update_partition_table = native_update_partition_table; +} + +static int __init get_idx_from_shift(unsigned int shift) +{ + int idx = -1; + + switch (shift) { + case 0xc: + idx = MMU_PAGE_4K; + break; + case 0x10: + idx = MMU_PAGE_64K; + break; + case 0x15: + idx = MMU_PAGE_2M; + break; + case 0x1e: + idx = MMU_PAGE_1G; + break; + } + return idx; +} + +static int __init radix_dt_scan_page_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + int size = 0; + int shift, idx; + unsigned int ap; + const __be32 *prop; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); + if (!prop) + return 0; + + pr_info("Page sizes from device-tree:\n"); + for (; size >= 4; size -= 4, ++prop) { + + struct mmu_psize_def *def; + + /* top 3 bit is AP encoding */ + shift = be32_to_cpu(prop[0]) & ~(0xe << 28); + ap = be32_to_cpu(prop[0]) >> 29; + pr_info("Page size sift = %d AP=0x%x\n", shift, ap); + + idx = get_idx_from_shift(shift); + if (idx < 0) + continue; + + def = &mmu_psize_defs[idx]; + def->shift = shift; + def->ap = ap; + } + + /* needed ? */ + cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; + return 1; +} + +static void __init radix_init_page_sizes(void) +{ + int rc; + + /* + * Try to find the available page sizes in the device-tree + */ + rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); + if (rc != 0) /* Found */ + goto found; + /* + * let's assume we have page 4k and 64k support + */ + mmu_psize_defs[MMU_PAGE_4K].shift = 12; + mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; + + mmu_psize_defs[MMU_PAGE_64K].shift = 16; + mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; +found: +#ifdef CONFIG_SPARSEMEM_VMEMMAP + if (mmu_psize_defs[MMU_PAGE_2M].shift) { + /* + * map vmemmap using 2M if available + */ + mmu_vmemmap_psize = MMU_PAGE_2M; + } +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + return; +} + +void __init radix__early_init_mmu(void) +{ + unsigned long lpcr; + /* + * setup LPCR UPRT based on mmu_features + */ + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT); + +#ifdef CONFIG_PPC_64K_PAGES + /* PAGE_SIZE mappings */ + mmu_virtual_psize = MMU_PAGE_64K; +#else + mmu_virtual_psize = MMU_PAGE_4K; +#endif + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* vmemmap mapping */ + mmu_vmemmap_psize = mmu_virtual_psize; +#endif + /* + * initialize page table size + */ + __pte_index_size = RADIX_PTE_INDEX_SIZE; + __pmd_index_size = RADIX_PMD_INDEX_SIZE; + __pud_index_size = RADIX_PUD_INDEX_SIZE; + __pgd_index_size = RADIX_PGD_INDEX_SIZE; + __pmd_cache_index = RADIX_PMD_INDEX_SIZE; + __pte_table_size = RADIX_PTE_TABLE_SIZE; + __pmd_table_size = RADIX_PMD_TABLE_SIZE; + __pud_table_size = RADIX_PUD_TABLE_SIZE; + __pgd_table_size = RADIX_PGD_TABLE_SIZE; + + __pmd_val_bits = RADIX_PMD_VAL_BITS; + __pud_val_bits = RADIX_PUD_VAL_BITS; + __pgd_val_bits = RADIX_PGD_VAL_BITS; + + __kernel_virt_start = RADIX_KERN_VIRT_START; + __kernel_virt_size = RADIX_KERN_VIRT_SIZE; + __vmalloc_start = RADIX_VMALLOC_START; + __vmalloc_end = RADIX_VMALLOC_END; + vmemmap = (struct page *)RADIX_VMEMMAP_BASE; + ioremap_bot = IOREMAP_BASE; + /* + * For now radix also use the same frag size + */ + __pte_frag_nr = H_PTE_FRAG_NR; + __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; + + radix_init_page_sizes(); + if (!firmware_has_feature(FW_FEATURE_LPAR)) + radix_init_partition_table(); + + radix_init_pgtable(); +} + +void radix__early_init_mmu_secondary(void) +{ + unsigned long lpcr; + /* + * setup LPCR UPRT based on mmu_features + */ + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT); + /* + * update partition table control register, 64 K size. + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + mtspr(SPRN_PTCR, + __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); +} + +void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + /* + * We limit the allocation that depend on ppc64_rma_size + * to first_memblock_size. We also clamp it to 1GB to + * avoid some funky things such as RTAS bugs. + * + * On radix config we really don't have a limitation + * on real mode access. But keeping it as above works + * well enough. + */ + ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); + /* + * Finally limit subsequent allocations. We really don't want + * to limit the memblock allocations to rma_size. FIXME!! should + * we even limit at all ? + */ + memblock_set_current_limit(first_memblock_base + first_memblock_size); +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int __meminit radix__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding */ + unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; + + BUG_ON(radix__map_kernel_page(start, phys, __pgprot(flags), page_size)); + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) +{ + /* FIXME!! intel does more. We should free page tables mapping vmemmap ? */ +} +#endif +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!radix__pmd_trans_huge(*pmdp)); + assert_spin_locked(&mm->page_table_lock); +#endif + + old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1); + trace_hugepage_update(addr, old, clr, set); + + return old; +} + +pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) + +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); + /* + * khugepaged calls this for normal pmd + */ + pmd = *pmdp; + pmd_clear(pmdp); + /*FIXME!! Verify whether we need this kick below */ + kick_all_cpus_sync(); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; +} + +/* + * For us pgtable_t is pte_t *. Inorder to save the deposisted + * page table, we consider the allocated page table as a list + * head. On withdraw we need to make sure we zero out the used + * list_head memory area. + */ +void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + struct list_head *lh = (struct list_head *) pgtable; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + if (!pmd_huge_pte(mm, pmdp)) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; +} + +pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pte_t *ptep; + pgtable_t pgtable; + struct list_head *lh; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + pgtable = pmd_huge_pte(mm, pmdp); + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + pmd_huge_pte(mm, pmdp) = NULL; + else { + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; + list_del(lh); + } + ptep = (pte_t *) pgtable; + *ptep = __pte(0); + ptep++; + *ptep = __pte(0); + return pgtable; +} + + +pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old_pmd; + unsigned long old; + + old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); + old_pmd = __pmd(old); + /* + * Serialize against find_linux_pte_or_hugepte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_linux_pte_or_hugepage to finish. + */ + kick_all_cpus_sync(); + return old_pmd; +} + +int radix__has_transparent_hugepage(void) +{ + /* For radix 2M at PMD level means thp */ + if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) + return 1; + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index de37ff445362..88a307504b5a 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -38,14 +38,25 @@ static inline int is_exec_fault(void) /* We only try to do i/d cache coherency on stuff that looks like * reasonably "normal" PTEs. We currently require a PTE to be present - * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that + * and we avoid _PAGE_SPECIAL and cache inhibited pte. We also only do that * on userspace PTEs */ static inline int pte_looks_normal(pte_t pte) { + +#if defined(CONFIG_PPC_BOOK3S_64) + if ((pte_val(pte) & (_PAGE_PRESENT | _PAGE_SPECIAL)) == _PAGE_PRESENT) { + if (pte_ci(pte)) + return 0; + if (pte_user(pte)) + return 1; + } + return 0; +#else return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) == - (_PAGE_PRESENT | _PAGE_USER); + (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER); +#endif } static struct page *maybe_pte_to_page(pte_t pte) @@ -71,6 +82,9 @@ static struct page *maybe_pte_to_page(pte_t pte) static pte_t set_pte_filter(pte_t pte) { + if (radix_enabled()) + return pte; + pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || cpu_has_feature(CPU_FTR_NOEXECUTE))) { @@ -177,8 +191,8 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, * _PAGE_PRESENT, but we can be sure that it is not in hpte. * Hence we can use set_pte_at for them. */ - VM_WARN_ON((pte_val(*ptep) & (_PAGE_PRESENT | _PAGE_USER)) == - (_PAGE_PRESENT | _PAGE_USER)); + VM_WARN_ON(pte_present(*ptep) && !pte_protnone(*ptep)); + /* * Add the pte bit when tryint set a pte */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 347106080bb1..e009e0604a8a 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -55,104 +55,63 @@ #include "mmu_decl.h" -#define CREATE_TRACE_POINTS -#include - -/* Some sanity checking */ -#if TASK_SIZE_USER64 > PGTABLE_RANGE -#error TASK_SIZE_USER64 exceeds pagetable range -#endif - #ifdef CONFIG_PPC_STD_MMU_64 #if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT)) #error TASK_SIZE_USER64 exceeds user VSID range #endif #endif -unsigned long ioremap_bot = IOREMAP_BASE; - -#ifdef CONFIG_PPC_MMU_NOHASH -static __ref void *early_alloc_pgtable(unsigned long size) -{ - void *pt; - - pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS))); - memset(pt, 0, size); - - return pt; -} -#endif /* CONFIG_PPC_MMU_NOHASH */ - +#ifdef CONFIG_PPC_BOOK3S_64 /* - * map_kernel_page currently only called by __ioremap - * map_kernel_page adds an entry to the ioremap page table - * and adds an entry to the HPT, possibly bolting it + * partition table and process table for ISA 3.0 */ -int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) -{ - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - if (slab_is_available()) { - pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); - if (!pudp) - return -ENOMEM; - pmdp = pmd_alloc(&init_mm, pudp, ea); - if (!pmdp) - return -ENOMEM; - ptep = pte_alloc_kernel(pmdp, ea); - if (!ptep) - return -ENOMEM; - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); - } else { -#ifdef CONFIG_PPC_MMU_NOHASH - pgdp = pgd_offset_k(ea); -#ifdef PUD_TABLE_SIZE - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - BUG_ON(pudp == NULL); - pgd_populate(&init_mm, pgdp, pudp); - } -#endif /* PUD_TABLE_SIZE */ - pudp = pud_offset(pgdp, ea); - if (pud_none(*pudp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); - BUG_ON(pmdp == NULL); - pud_populate(&init_mm, pudp, pmdp); - } - pmdp = pmd_offset(pudp, ea); - if (!pmd_present(*pmdp)) { - ptep = early_alloc_pgtable(PAGE_SIZE); - BUG_ON(ptep == NULL); - pmd_populate_kernel(&init_mm, pmdp, ptep); - } - ptep = pte_offset_kernel(pmdp, ea); - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); -#else /* CONFIG_PPC_MMU_NOHASH */ - /* - * If the mm subsystem is not fully up, we cannot create a - * linux page table entry for this mapping. Simply bolt an - * entry in the hardware page table. - * - */ - if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, - mmu_io_psize, mmu_kernel_ssize)) { - printk(KERN_ERR "Failed to do bolted mapping IO " - "memory at %016lx !\n", pa); - return -ENOMEM; - } -#endif /* !CONFIG_PPC_MMU_NOHASH */ - } - - smp_wmb(); - return 0; -} - +struct prtb_entry *process_tb; +struct patb_entry *partition_tb; +/* + * page table size + */ +unsigned long __pte_index_size; +EXPORT_SYMBOL(__pte_index_size); +unsigned long __pmd_index_size; +EXPORT_SYMBOL(__pmd_index_size); +unsigned long __pud_index_size; +EXPORT_SYMBOL(__pud_index_size); +unsigned long __pgd_index_size; +EXPORT_SYMBOL(__pgd_index_size); +unsigned long __pmd_cache_index; +EXPORT_SYMBOL(__pmd_cache_index); +unsigned long __pte_table_size; +EXPORT_SYMBOL(__pte_table_size); +unsigned long __pmd_table_size; +EXPORT_SYMBOL(__pmd_table_size); +unsigned long __pud_table_size; +EXPORT_SYMBOL(__pud_table_size); +unsigned long __pgd_table_size; +EXPORT_SYMBOL(__pgd_table_size); +unsigned long __pmd_val_bits; +EXPORT_SYMBOL(__pmd_val_bits); +unsigned long __pud_val_bits; +EXPORT_SYMBOL(__pud_val_bits); +unsigned long __pgd_val_bits; +EXPORT_SYMBOL(__pgd_val_bits); +unsigned long __kernel_virt_start; +EXPORT_SYMBOL(__kernel_virt_start); +unsigned long __kernel_virt_size; +EXPORT_SYMBOL(__kernel_virt_size); +unsigned long __vmalloc_start; +EXPORT_SYMBOL(__vmalloc_start); +unsigned long __vmalloc_end; +EXPORT_SYMBOL(__vmalloc_end); +struct page *vmemmap; +EXPORT_SYMBOL(vmemmap); +unsigned long __pte_frag_nr; +EXPORT_SYMBOL(__pte_frag_nr); +unsigned long __pte_frag_size_shift; +EXPORT_SYMBOL(__pte_frag_size_shift); +unsigned long ioremap_bot; +#else /* !CONFIG_PPC_BOOK3S_64 */ +unsigned long ioremap_bot = IOREMAP_BASE; +#endif /** * __ioremap_at - Low level function to establish the page tables @@ -167,12 +126,8 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, if ((flags & _PAGE_PRESENT) == 0) flags |= pgprot_val(PAGE_KERNEL); - /* Non-cacheable page cannot be coherent */ - if (flags & _PAGE_NO_CACHE) - flags &= ~_PAGE_COHERENT; - /* We don't support the 4K PFN hack with ioremap */ - if (flags & _PAGE_4K_PFN) + if (flags & H_PAGE_4K_PFN) return NULL; WARN_ON(pa & ~PAGE_MASK); @@ -253,7 +208,7 @@ void __iomem * __ioremap(phys_addr_t addr, unsigned long size, void __iomem * ioremap(phys_addr_t addr, unsigned long size) { - unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED; + unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0))); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) @@ -263,7 +218,7 @@ void __iomem * ioremap(phys_addr_t addr, unsigned long size) void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) { - unsigned long flags = _PAGE_NO_CACHE; + unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0))); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) @@ -277,11 +232,20 @@ void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, void *caller = __builtin_return_address(0); /* writeable implies dirty for kernel addresses */ - if (flags & _PAGE_RW) + if (flags & _PAGE_WRITE) flags |= _PAGE_DIRTY; - /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - flags &= ~(_PAGE_USER | _PAGE_EXEC); + /* we don't want to let _PAGE_EXEC leak out */ + flags &= ~_PAGE_EXEC; + /* + * Force kernel mapping. + */ +#if defined(CONFIG_PPC_BOOK3S_64) + flags |= _PAGE_PRIVILEGED; +#else + flags &= ~_PAGE_USER; +#endif + #ifdef _PAGE_BAP_SR /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format @@ -411,7 +375,7 @@ static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) return (pte_t *)ret; } -pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) +pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) { pte_t *pte; @@ -421,8 +385,9 @@ pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) return __alloc_for_cache(mm, kernel); } +#endif /* CONFIG_PPC_64K_PAGES */ -void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel) +void pte_fragment_free(unsigned long *table, int kernel) { struct page *page = virt_to_page(table); if (put_page_testzero(page)) { @@ -433,15 +398,6 @@ void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel) } #ifdef CONFIG_SMP -static void page_table_free_rcu(void *table) -{ - struct page *page = virt_to_page(table); - if (put_page_testzero(page)) { - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } -} - void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) { unsigned long pgf = (unsigned long)table; @@ -458,7 +414,7 @@ void __tlb_remove_table(void *_table) if (!shift) /* PTE page needs special handling */ - page_table_free_rcu(table); + pte_fragment_free(table, 0); else { BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); kmem_cache_free(PGT_CACHE(shift), table); @@ -469,385 +425,10 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) { if (!shift) { /* PTE page needs special handling */ - struct page *page = virt_to_page(table); - if (put_page_testzero(page)) { - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } + pte_fragment_free(table, 0); } else { BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); kmem_cache_free(PGT_CACHE(shift), table); } } #endif -#endif /* CONFIG_PPC_64K_PAGES */ - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - -/* - * This is called when relaxing access to a hugepage. It's also called in the page - * fault path when we don't hit any of the major fault cases, ie, a minor - * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have - * handled those two for us, we additionally deal with missing execute - * permission here on some processors - */ -int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp, pmd_t entry, int dirty) -{ - int changed; -#ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); - assert_spin_locked(&vma->vm_mm->page_table_lock); -#endif - changed = !pmd_same(*(pmdp), entry); - if (changed) { - __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); - /* - * Since we are not supporting SW TLB systems, we don't - * have any thing similar to flush_tlb_page_nohash() - */ - } - return changed; -} - -unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long clr, - unsigned long set) -{ - - unsigned long old, tmp; - -#ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); - assert_spin_locked(&mm->page_table_lock); -#endif - -#ifdef PTE_ATOMIC_UPDATES - __asm__ __volatile__( - "1: ldarx %0,0,%3\n\ - andi. %1,%0,%6\n\ - bne- 1b \n\ - andc %1,%0,%4 \n\ - or %1,%1,%7\n\ - stdcx. %1,0,%3 \n\ - bne- 1b" - : "=&r" (old), "=&r" (tmp), "=m" (*pmdp) - : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set) - : "cc" ); -#else - old = pmd_val(*pmdp); - *pmdp = __pmd((old & ~clr) | set); -#endif - trace_hugepage_update(addr, old, clr, set); - if (old & _PAGE_HASHPTE) - hpte_do_hugepage_flush(mm, addr, pmdp, old); - return old; -} - -pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_t pmd; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(pmd_trans_huge(*pmdp)); - - pmd = *pmdp; - pmd_clear(pmdp); - /* - * Wait for all pending hash_page to finish. This is needed - * in case of subpage collapse. When we collapse normal pages - * to hugepage, we first clear the pmd, then invalidate all - * the PTE entries. The assumption here is that any low level - * page fault will see a none pmd and take the slow path that - * will wait on mmap_sem. But we could very well be in a - * hash_page with local ptep pointer value. Such a hash page - * can result in adding new HPTE entries for normal subpages. - * That means we could be modifying the page content as we - * copy them to a huge page. So wait for parallel hash_page - * to finish before invalidating HPTE entries. We can do this - * by sending an IPI to all the cpus and executing a dummy - * function there. - */ - kick_all_cpus_sync(); - /* - * Now invalidate the hpte entries in the range - * covered by pmd. This make sure we take a - * fault and will find the pmd as none, which will - * result in a major fault which takes mmap_sem and - * hence wait for collapse to complete. Without this - * the __collapse_huge_page_copy can result in copying - * the old content. - */ - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); - return pmd; -} - -int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); -} - -/* - * We currently remove entries from the hashtable regardless of whether - * the entry was young or dirty. The generic routines only flush if the - * entry was young or dirty which is not good enough. - * - * We should be more intelligent about this but for the moment we override - * these functions and force a tlb flush unconditionally - */ -int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); -} - -/* - * We want to put the pgtable in pmd and use pgtable for tracking - * the base page size hptes - */ -void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) -{ - pgtable_t *pgtable_slot; - assert_spin_locked(&mm->page_table_lock); - /* - * we store the pgtable in the second half of PMD - */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - *pgtable_slot = pgtable; - /* - * expose the deposited pgtable to other cpus. - * before we set the hugepage PTE at pmd level - * hash fault code looks at the deposted pgtable - * to store hash index values. - */ - smp_wmb(); -} - -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) -{ - pgtable_t pgtable; - pgtable_t *pgtable_slot; - - assert_spin_locked(&mm->page_table_lock); - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Once we withdraw, mark the entry NULL. - */ - *pgtable_slot = NULL; - /* - * We store HPTE information in the deposited PTE fragment. - * zero out the content on withdraw. - */ - memset(pgtable, 0, PTE_FRAG_SIZE); - return pgtable; -} - -void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); - - /* - * We can't mark the pmd none here, because that will cause a race - * against exit_mmap. We need to continue mark pmd TRANS HUGE, while - * we spilt, but at the same time we wan't rest of the ppc64 code - * not to insert hash pte on this, because we will be modifying - * the deposited pgtable in the caller of this function. Hence - * clear the _PAGE_USER so that we move the fault handling to - * higher level function and that will serialize against ptl. - * We need to flush existing hash pte entries here even though, - * the translation is still valid, because we will withdraw - * pgtable_t after this. - */ - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_USER, 0); -} - - -/* - * set a new huge pmd. We should not be called for updating - * an existing pmd entry. That should go via pmd_hugepage_update. - */ -void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) -{ -#ifdef CONFIG_DEBUG_VM - WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) == - (_PAGE_PRESENT | _PAGE_USER)); - assert_spin_locked(&mm->page_table_lock); - WARN_ON(!pmd_trans_huge(pmd)); -#endif - trace_hugepage_set_pmd(addr, pmd_val(pmd)); - return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); -} - -/* - * We use this to invalidate a pmdp entry before switching from a - * hugepte to regular pmd entry. - */ -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); - - /* - * This ensures that generic code that rely on IRQ disabling - * to prevent a parallel THP split work as expected. - */ - kick_all_cpus_sync(); -} - -/* - * A linux hugepage PMD was changed and the corresponding hash table entries - * neesd to be flushed. - */ -void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long old_pmd) -{ - int ssize; - unsigned int psize; - unsigned long vsid; - unsigned long flags = 0; - const struct cpumask *tmp; - - /* get the base page size,vsid and segment size */ -#ifdef CONFIG_DEBUG_VM - psize = get_slice_psize(mm, addr); - BUG_ON(psize == MMU_PAGE_16M); -#endif - if (old_pmd & _PAGE_COMBO) - psize = MMU_PAGE_4K; - else - psize = MMU_PAGE_64K; - - if (!is_kernel_addr(addr)) { - ssize = user_segment_size(addr); - vsid = get_vsid(mm->context.id, addr, ssize); - WARN_ON(vsid == 0); - } else { - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); - ssize = mmu_kernel_ssize; - } - - tmp = cpumask_of(smp_processor_id()); - if (cpumask_equal(mm_cpumask(mm), tmp)) - flags |= HPTE_LOCAL_UPDATE; - - return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); -} - -static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) -{ - return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); -} - -pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) -{ - unsigned long pmdv; - - pmdv = (pfn << PTE_RPN_SHIFT) & PTE_RPN_MASK; - return pmd_set_protbits(__pmd(pmdv), pgprot); -} - -pmd_t mk_pmd(struct page *page, pgprot_t pgprot) -{ - return pfn_pmd(page_to_pfn(page), pgprot); -} - -pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) -{ - unsigned long pmdv; - - pmdv = pmd_val(pmd); - pmdv &= _HPAGE_CHG_MASK; - return pmd_set_protbits(__pmd(pmdv), newprot); -} - -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a HUGE PMD entry in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux HUGE PMD entry. - */ -void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd) -{ - return; -} - -pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t old_pmd; - pgtable_t pgtable; - unsigned long old; - pgtable_t *pgtable_slot; - - old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); - old_pmd = __pmd(old); - /* - * We have pmd == none and we are holding page_table_lock. - * So we can safely go and clear the pgtable hash - * index info. - */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Let's zero out old valid and hash index details - * hash fault look at them. - */ - memset(pgtable, 0, PTE_FRAG_SIZE); - /* - * Serialize against find_linux_pte_or_hugepte which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_linux_pte_or_hugepage to finish. - */ - kick_all_cpus_sync(); - return old_pmd; -} - -int has_transparent_hugepage(void) -{ - - BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) >= MAX_ORDER, - "hugepages can't be allocated by the buddy allocator"); - - BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) < 2, - "We need more than 2 pages to do deferred thp split"); - - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) - return 0; - /* - * We support THP only if PMD_SIZE is 16MB. - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) - return 0; - /* - * We need to make sure that we support 16MB hugepage in a segement - * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE - * of 64K. - */ - /* - * If we have 64K HPTE, we will be using that by default - */ - if (mmu_psize_defs[MMU_PAGE_64K].shift && - (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) - return 0; - /* - * Ok we only have 4K HPTE - */ - if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) - return 0; - - return 1; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 825b6873391f..48fc28bab544 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -32,7 +32,6 @@ enum slb_index { }; extern void slb_allocate_realmode(unsigned long ea); -extern void slb_allocate_user(unsigned long ea); static void slb_allocate(unsigned long ea) { diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 736d18b3cefd..dfdb90cb4403 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -35,7 +35,7 @@ _GLOBAL(slb_allocate_realmode) * check for bad kernel/user address * (ea & ~REGION_MASK) >= PGTABLE_RANGE */ - rldicr. r9,r3,4,(63 - PGTABLE_EADDR_SIZE - 4) + rldicr. r9,r3,4,(63 - H_PGTABLE_EADDR_SIZE - 4) bne- 8f srdi r9,r3,60 /* get region */ @@ -91,7 +91,7 @@ slb_miss_kernel_load_vmemmap: * can be demoted from 64K -> 4K dynamically on some machines */ clrldi r11,r10,48 - cmpldi r11,(VMALLOC_SIZE >> 28) - 1 + cmpldi r11,(H_VMALLOC_SIZE >> 28) - 1 bgt 5f lhz r11,PACAVMALLOCSLLP(r13) b 6f @@ -179,56 +179,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) li r11,SLB_VSID_USER /* flags don't much matter */ b slb_finish_load -#ifdef __DISABLED__ - -/* void slb_allocate_user(unsigned long ea); - * - * Create an SLB entry for the given EA (user or kernel). - * r3 = faulting address, r13 = PACA - * r9, r10, r11 are clobbered by this function - * No other registers are examined or changed. - * - * It is called with translation enabled in order to be able to walk the - * page tables. This is not currently used. - */ -_GLOBAL(slb_allocate_user) - /* r3 = faulting address */ - srdi r10,r3,28 /* get esid */ - - crset 4*cr7+lt /* set "user" flag for later */ - - /* check if we fit in the range covered by the pagetables*/ - srdi. r9,r3,PGTABLE_EADDR_SIZE - crnot 4*cr0+eq,4*cr0+eq - beqlr - - /* now we need to get to the page tables in order to get the page - * size encoding from the PMD. In the future, we'll be able to deal - * with 1T segments too by getting the encoding from the PGD instead - */ - ld r9,PACAPGDIR(r13) - cmpldi cr0,r9,0 - beqlr - rlwinm r11,r10,8,25,28 - ldx r9,r9,r11 /* get pgd_t */ - cmpldi cr0,r9,0 - beqlr - rlwinm r11,r10,3,17,28 - ldx r9,r9,r11 /* get pmd_t */ - cmpldi cr0,r9,0 - beqlr - - /* build vsid flags */ - andi. r11,r9,SLB_VSID_LLP - ori r11,r11,SLB_VSID_USER - - /* get context to calculate proto-VSID */ - ld r9,PACACONTEXTID(r13) - /* fall through slb_finish_load */ - -#endif /* __DISABLED__ */ - - /* * Finish loading of an SLB entry and return * diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 42954f0b47ac..2b27458902ee 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -37,8 +37,8 @@ #include /* some sanity checks */ -#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE -#error PGTABLE_RANGE exceeds slice_mask high_slices size +#if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE +#error H_PGTABLE_RANGE exceeds slice_mask high_slices size #endif static DEFINE_SPINLOCK(slice_convert_lock); @@ -395,6 +395,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* Sanity checks */ BUG_ON(mm->task_size == 0); + VM_BUG_ON(radix_enabled()); slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n", @@ -568,6 +569,16 @@ unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) unsigned char *hpsizes; int index, mask_index; + /* + * Radix doesn't use slice, but can get enabled along with MMU_SLICE + */ + if (radix_enabled()) { +#ifdef CONFIG_PPC_64K_PAGES + return MMU_PAGE_64K; +#else + return MMU_PAGE_4K; +#endif + } if (addr < SLICE_LOW_TOP) { u64 lpsizes; lpsizes = mm->context.low_slices_psize; @@ -605,6 +616,7 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize); + VM_BUG_ON(radix_enabled()); spin_lock_irqsave(&slice_convert_lock, flags); old_psize = mm->context.user_psize; @@ -649,6 +661,7 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start, { struct slice_mask mask = slice_range_to_mask(start, len); + VM_BUG_ON(radix_enabled()); slice_convert(mm, mask, psize); } @@ -678,6 +691,9 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, struct slice_mask mask, available; unsigned int psize = mm->context.user_psize; + if (radix_enabled()) + return 0; + mask = slice_range_to_mask(addr, len); available = slice_mask_for_size(mm, psize); #ifdef CONFIG_PPC_64K_PAGES diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c new file mode 100644 index 000000000000..0fdaf93a3e09 --- /dev/null +++ b/arch/powerpc/mm/tlb-radix.c @@ -0,0 +1,251 @@ +/* + * TLB flush routines for radix kernels. + * + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include +#include + +static DEFINE_RAW_SPINLOCK(native_tlbie_lock); + +static inline void __tlbiel_pid(unsigned long pid, int set) +{ + unsigned long rb,rs,ric,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rb |= set << PPC_BITLSHIFT(51); + rs = ((unsigned long)pid) << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + ric = 2; /* invalidate all the caches */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("ptesync": : :"memory"); +} + +/* + * We use 128 set in radix mode and 256 set in hpt mode. + */ +static inline void _tlbiel_pid(unsigned long pid) +{ + int set; + + for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) { + __tlbiel_pid(pid, set); + } + return; +} + +static inline void _tlbie_pid(unsigned long pid) +{ + unsigned long rb,rs,ric,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + ric = 2; /* invalidate all the caches */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbiel_va(unsigned long va, unsigned long pid, + unsigned long ap) +{ + unsigned long rb,rs,ric,prs,r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + ric = 0; /* no cluster flush yet */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("ptesync": : :"memory"); +} + +static inline void _tlbie_va(unsigned long va, unsigned long pid, + unsigned long ap) +{ + unsigned long rb,rs,ric,prs,r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + ric = 0; /* no cluster flush yet */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +/* + * Base TLB flushing operations: + * + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes kernel pages + * + * - local_* variants of page and mm only apply to the current + * processor + */ +void radix__local_flush_tlb_mm(struct mm_struct *mm) +{ + unsigned int pid; + + preempt_disable(); + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbiel_pid(pid); + preempt_enable(); +} +EXPORT_SYMBOL(radix__local_flush_tlb_mm); + +void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + unsigned long ap, int nid) +{ + unsigned int pid; + + preempt_disable(); + pid = mm ? mm->context.id : 0; + if (pid != MMU_NO_CONTEXT) + _tlbiel_va(vmaddr, pid, ap); + preempt_enable(); +} + +void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + /* need the return fix for nohash.c */ + if (vma && is_vm_hugetlb_page(vma)) + return __local_flush_hugetlb_page(vma, vmaddr); +#endif + radix___local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + mmu_get_ap(mmu_virtual_psize), 0); +} +EXPORT_SYMBOL(radix__local_flush_tlb_page); + +#ifdef CONFIG_SMP +static int mm_is_core_local(struct mm_struct *mm) +{ + return cpumask_subset(mm_cpumask(mm), + topology_sibling_cpumask(smp_processor_id())); +} + +void radix__flush_tlb_mm(struct mm_struct *mm) +{ + unsigned int pid; + + preempt_disable(); + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto no_context; + + if (!mm_is_core_local(mm)) { + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_pid(pid); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } else + _tlbiel_pid(pid); +no_context: + preempt_enable(); +} +EXPORT_SYMBOL(radix__flush_tlb_mm); + +void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + unsigned long ap, int nid) +{ + unsigned int pid; + + preempt_disable(); + pid = mm ? mm->context.id : 0; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto bail; + if (!mm_is_core_local(mm)) { + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_va(vmaddr, pid, ap); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } else + _tlbiel_va(vmaddr, pid, ap); +bail: + preempt_enable(); +} + +void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (vma && is_vm_hugetlb_page(vma)) + return flush_hugetlb_page(vma, vmaddr); +#endif + radix___flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + mmu_get_ap(mmu_virtual_psize), 0); +} +EXPORT_SYMBOL(radix__flush_tlb_page); + +#endif /* CONFIG_SMP */ + +void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_pid(0); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); +} +EXPORT_SYMBOL(radix__flush_tlb_kernel_range); + +/* + * Currently, for range flushing, we just do a full mm flush. Because + * we use this in code path where we don' track the page size. + */ +void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) + +{ + struct mm_struct *mm = vma->vm_mm; + radix__flush_tlb_mm(mm); +} +EXPORT_SYMBOL(radix__flush_tlb_range); + + +void radix__tlb_flush(struct mmu_gather *tlb) +{ + struct mm_struct *mm = tlb->mm; + radix__flush_tlb_mm(mm); +} diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index f7b80391bee7..4517aa43a8b1 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -155,7 +155,7 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) batch->index = 0; } -void tlb_flush(struct mmu_gather *tlb) +void hash__tlb_flush(struct mmu_gather *tlb) { struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch); @@ -218,7 +218,7 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, pte = pte_val(*ptep); if (is_thp) trace_hugepage_invalidate(start, pte); - if (!(pte & _PAGE_HASHPTE)) + if (!(pte & H_PAGE_HASHPTE)) continue; if (unlikely(is_thp)) hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte); @@ -248,7 +248,7 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) start_pte = pte_offset_map(pmd, addr); for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { unsigned long pteval = pte_val(*pte); - if (pteval & _PAGE_HASHPTE) + if (pteval & H_PAGE_HASHPTE) hpte_need_flush(mm, addr, pte, pteval, 0); addr += PAGE_SIZE; } diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile index f9c083a5652a..77b6394a7c50 100644 --- a/arch/powerpc/perf/Makefile +++ b/arch/powerpc/perf/Makefile @@ -1,6 +1,6 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror -obj-$(CONFIG_PERF_EVENTS) += callchain.o +obj-$(CONFIG_PERF_EVENTS) += callchain.o perf_regs.o obj-$(CONFIG_PPC_PERF_CTRS) += core-book3s.o bhrb.o obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \ diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index 22d9015c1acc..26d37e6f924e 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -137,7 +137,7 @@ static int read_user_stack_slow(void __user *ptr, void *buf, int nb) offset = addr & ((1UL << shift) - 1); pte = READ_ONCE(*ptep); - if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER)) + if (!pte_present(pte) || !pte_user(pte)) goto err_out; pfn = pte_pfn(pte); if (!page_is_ram(pfn)) diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c new file mode 100644 index 000000000000..d24a8a3668fa --- /dev/null +++ b/arch/powerpc/perf/perf_regs.c @@ -0,0 +1,104 @@ +/* + * Copyright 2016 Anju T, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r) + +#define REG_RESERVED (~((1ULL << PERF_REG_POWERPC_MAX) - 1)) + +static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = { + PT_REGS_OFFSET(PERF_REG_POWERPC_R0, gpr[0]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R1, gpr[1]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R2, gpr[2]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R3, gpr[3]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R4, gpr[4]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R5, gpr[5]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R6, gpr[6]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R7, gpr[7]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R8, gpr[8]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R9, gpr[9]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R10, gpr[10]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R11, gpr[11]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R12, gpr[12]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R13, gpr[13]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R14, gpr[14]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R15, gpr[15]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R16, gpr[16]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R17, gpr[17]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R18, gpr[18]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R19, gpr[19]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R20, gpr[20]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R21, gpr[21]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R22, gpr[22]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R23, gpr[23]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R24, gpr[24]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R25, gpr[25]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R26, gpr[26]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R27, gpr[27]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R28, gpr[28]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R29, gpr[29]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R30, gpr[30]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R31, gpr[31]), + PT_REGS_OFFSET(PERF_REG_POWERPC_NIP, nip), + PT_REGS_OFFSET(PERF_REG_POWERPC_MSR, msr), + PT_REGS_OFFSET(PERF_REG_POWERPC_ORIG_R3, orig_gpr3), + PT_REGS_OFFSET(PERF_REG_POWERPC_CTR, ctr), + PT_REGS_OFFSET(PERF_REG_POWERPC_LINK, link), + PT_REGS_OFFSET(PERF_REG_POWERPC_XER, xer), + PT_REGS_OFFSET(PERF_REG_POWERPC_CCR, ccr), +#ifdef CONFIG_PPC64 + PT_REGS_OFFSET(PERF_REG_POWERPC_SOFTE, softe), +#else + PT_REGS_OFFSET(PERF_REG_POWERPC_SOFTE, mq), +#endif + PT_REGS_OFFSET(PERF_REG_POWERPC_TRAP, trap), + PT_REGS_OFFSET(PERF_REG_POWERPC_DAR, dar), + PT_REGS_OFFSET(PERF_REG_POWERPC_DSISR, dsisr), +}; + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + if (WARN_ON_ONCE(idx >= PERF_REG_POWERPC_MAX)) + return 0; + + return regs_get_register(regs, pt_regs_offset[idx]); +} + +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ +#ifdef CONFIG_PPC64 + if (!test_tsk_thread_flag(task, TIF_32BIT)) + return PERF_SAMPLE_REGS_ABI_64; + else +#endif + return PERF_SAMPLE_REGS_ABI_32; +} + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs, + struct pt_regs *regs_user_copy) +{ + regs_user->regs = task_pt_regs(current); + regs_user->abi = perf_reg_abi(current); +} diff --git a/arch/powerpc/perf/power8-events-list.h b/arch/powerpc/perf/power8-events-list.h index 741b77edd03e..3a2e6e8ebb92 100644 --- a/arch/powerpc/perf/power8-events-list.h +++ b/arch/powerpc/perf/power8-events-list.h @@ -49,3 +49,43 @@ EVENT(PM_L3_PREF_ALL, 0x4e052) EVENT(PM_DTLB_MISS, 0x300fc) /* ITLB Reloaded */ EVENT(PM_ITLB_MISS, 0x400fc) +/* Run_Instructions */ +EVENT(PM_RUN_INST_CMPL, 0x500fa) +/* Alternate event code for PM_RUN_INST_CMPL */ +EVENT(PM_RUN_INST_CMPL_ALT, 0x400fa) +/* Run_cycles */ +EVENT(PM_RUN_CYC, 0x600f4) +/* Alternate event code for Run_cycles */ +EVENT(PM_RUN_CYC_ALT, 0x200f4) +/* Marked store completed */ +EVENT(PM_MRK_ST_CMPL, 0x10134) +/* Alternate event code for Marked store completed */ +EVENT(PM_MRK_ST_CMPL_ALT, 0x301e2) +/* Marked two path branch */ +EVENT(PM_BR_MRK_2PATH, 0x10138) +/* Alternate event code for PM_BR_MRK_2PATH */ +EVENT(PM_BR_MRK_2PATH_ALT, 0x40138) +/* L3 castouts in Mepf state */ +EVENT(PM_L3_CO_MEPF, 0x18082) +/* Alternate event code for PM_L3_CO_MEPF */ +EVENT(PM_L3_CO_MEPF_ALT, 0x3e05e) +/* Data cache was reloaded from a location other than L2 due to a marked load */ +EVENT(PM_MRK_DATA_FROM_L2MISS, 0x1d14e) +/* Alternate event code for PM_MRK_DATA_FROM_L2MISS */ +EVENT(PM_MRK_DATA_FROM_L2MISS_ALT, 0x401e8) +/* Alternate event code for PM_CMPLU_STALL */ +EVENT(PM_CMPLU_STALL_ALT, 0x1e054) +/* Two path branch */ +EVENT(PM_BR_2PATH, 0x20036) +/* Alternate event code for PM_BR_2PATH */ +EVENT(PM_BR_2PATH_ALT, 0x40036) +/* # PPC Dispatched */ +EVENT(PM_INST_DISP, 0x200f2) +/* Alternate event code for PM_INST_DISP */ +EVENT(PM_INST_DISP_ALT, 0x300f2) +/* Marked filter Match */ +EVENT(PM_MRK_FILT_MATCH, 0x2013c) +/* Alternate event code for PM_MRK_FILT_MATCH */ +EVENT(PM_MRK_FILT_MATCH_ALT, 0x3012e) +/* Alternate event code for PM_LD_MISS_L1 */ +EVENT(PM_LD_MISS_L1_ALT, 0x400f0) diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c index 690d9186a855..7cf3b4378192 100644 --- a/arch/powerpc/perf/power8-pmu.c +++ b/arch/powerpc/perf/power8-pmu.c @@ -274,7 +274,8 @@ static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long /* Ignore Linux defined bits when checking event below */ base_event = event & ~EVENT_LINUX_MASK; - if (pmc >= 5 && base_event != 0x500fa && base_event != 0x600f4) + if (pmc >= 5 && base_event != PM_RUN_INST_CMPL && + base_event != PM_RUN_CYC) return -1; mask |= CNST_PMC_MASK(pmc); @@ -488,17 +489,17 @@ static int power8_compute_mmcr(u64 event[], int n_ev, /* Table of alternatives, sorted by column 0 */ static const unsigned int event_alternatives[][MAX_ALT] = { - { 0x10134, 0x301e2 }, /* PM_MRK_ST_CMPL */ - { 0x10138, 0x40138 }, /* PM_BR_MRK_2PATH */ - { 0x18082, 0x3e05e }, /* PM_L3_CO_MEPF */ - { 0x1d14e, 0x401e8 }, /* PM_MRK_DATA_FROM_L2MISS */ - { 0x1e054, 0x4000a }, /* PM_CMPLU_STALL */ - { 0x20036, 0x40036 }, /* PM_BR_2PATH */ - { 0x200f2, 0x300f2 }, /* PM_INST_DISP */ - { 0x200f4, 0x600f4 }, /* PM_RUN_CYC */ - { 0x2013c, 0x3012e }, /* PM_MRK_FILT_MATCH */ - { 0x3e054, 0x400f0 }, /* PM_LD_MISS_L1 */ - { 0x400fa, 0x500fa }, /* PM_RUN_INST_CMPL */ + { PM_MRK_ST_CMPL, PM_MRK_ST_CMPL_ALT }, + { PM_BR_MRK_2PATH, PM_BR_MRK_2PATH_ALT }, + { PM_L3_CO_MEPF, PM_L3_CO_MEPF_ALT }, + { PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L2MISS_ALT }, + { PM_CMPLU_STALL_ALT, PM_CMPLU_STALL }, + { PM_BR_2PATH, PM_BR_2PATH_ALT }, + { PM_INST_DISP, PM_INST_DISP_ALT }, + { PM_RUN_CYC_ALT, PM_RUN_CYC }, + { PM_MRK_FILT_MATCH, PM_MRK_FILT_MATCH_ALT }, + { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT }, + { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, }; /* @@ -546,17 +547,17 @@ static int power8_get_alternatives(u64 event, unsigned int flags, u64 alt[]) j = num_alt; for (i = 0; i < num_alt; ++i) { switch (alt[i]) { - case 0x1e: /* PM_CYC */ - alt[j++] = 0x600f4; /* PM_RUN_CYC */ + case PM_CYC: + alt[j++] = PM_RUN_CYC; break; - case 0x600f4: /* PM_RUN_CYC */ - alt[j++] = 0x1e; + case PM_RUN_CYC: + alt[j++] = PM_CYC; break; - case 0x2: /* PM_PPC_CMPL */ - alt[j++] = 0x500fa; /* PM_RUN_INST_CMPL */ + case PM_INST_CMPL: + alt[j++] = PM_RUN_INST_CMPL; break; - case 0x500fa: /* PM_RUN_INST_CMPL */ - alt[j++] = 0x2; /* PM_PPC_CMPL */ + case PM_RUN_INST_CMPL: + alt[j++] = PM_INST_CMPL; break; } } diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 142dff5e96d6..77e9b8d591fb 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -72,7 +72,7 @@ config PPC_BOOK3S_64 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select SYS_SUPPORTS_HUGETLBFS - select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES + select HAVE_ARCH_TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_NUMA_BALANCING select IRQ_WORK @@ -331,6 +331,15 @@ config PPC_STD_MMU_64 def_bool y depends on PPC_STD_MMU && PPC64 +config PPC_RADIX_MMU + bool "Radix MMU Support" + depends on PPC_BOOK3S_64 + default y + help + Enable support for the Power ISA 3.0 Radix style MMU. Currently this + is only implemented by IBM Power9 CPUs, if you don't have one of them + you can probably disable this. + config PPC_MMU_NOHASH def_bool y depends on !PPC_STD_MMU diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index f7af74f83693..3cbe38fad609 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include @@ -197,7 +197,7 @@ static int __spu_trap_data_map(struct spu *spu, unsigned long ea, u64 dsisr) (REGION_ID(ea) != USER_REGION_ID)) { spin_unlock(&spu->register_lock); - ret = hash_page(ea, _PAGE_PRESENT, 0x300, dsisr); + ret = hash_page(ea, _PAGE_PRESENT | _PAGE_READ, 0x300, dsisr); spin_lock(&spu->register_lock); if (!ret) { @@ -805,7 +805,4 @@ static int __init init_spu_base(void) out: return ret; } -module_init(init_spu_base); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnd Bergmann "); +device_initcall(init_spu_base); diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c index d98f845ac777..e29e4d5afa2d 100644 --- a/arch/powerpc/platforms/cell/spufs/fault.c +++ b/arch/powerpc/platforms/cell/spufs/fault.c @@ -141,8 +141,8 @@ int spufs_handle_class1(struct spu_context *ctx) /* we must not hold the lock when entering copro_handle_mm_fault */ spu_release(ctx); - access = (_PAGE_PRESENT | _PAGE_USER); - access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_RW : 0UL; + access = (_PAGE_PRESENT | _PAGE_READ); + access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_WRITE : 0UL; local_irq_save(flags); ret = hash_page(ea, access, 0x300, dsisr); local_irq_restore(flags); diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 950b3e539057..9226df11bf39 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -75,7 +75,7 @@ static int pnv_eeh_init(void) * and P7IOC separately. So we should regard * PE#0 as valid for PHB3 and P7IOC. */ - if (phb->ioda.reserved_pe != 0) + if (phb->ioda.reserved_pe_idx != 0) eeh_add_flag(EEH_VALID_PE_ZERO); break; @@ -1009,8 +1009,9 @@ static int pnv_eeh_reset_vf_pe(struct eeh_pe *pe, int option) static int pnv_eeh_reset(struct eeh_pe *pe, int option) { struct pci_controller *hose = pe->phb; + struct pnv_phb *phb; struct pci_bus *bus; - int ret; + int64_t rc; /* * For PHB reset, we always have complete reset. For those PEs whose @@ -1026,45 +1027,39 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option) * reset. The side effect is that EEH core has to clear the frozen * state explicitly after BAR restore. */ - if (pe->type & EEH_PE_PHB) { - ret = pnv_eeh_phb_reset(hose, option); - } else { - struct pnv_phb *phb; - s64 rc; + if (pe->type & EEH_PE_PHB) + return pnv_eeh_phb_reset(hose, option); - /* - * The frozen PE might be caused by PAPR error injection - * registers, which are expected to be cleared after hitting - * frozen PE as stated in the hardware spec. Unfortunately, - * that's not true on P7IOC. So we have to clear it manually - * to avoid recursive EEH errors during recovery. - */ - phb = hose->private_data; - if (phb->model == PNV_PHB_MODEL_P7IOC && - (option == EEH_RESET_HOT || - option == EEH_RESET_FUNDAMENTAL)) { - rc = opal_pci_reset(phb->opal_id, - OPAL_RESET_PHB_ERROR, - OPAL_ASSERT_RESET); - if (rc != OPAL_SUCCESS) { - pr_warn("%s: Failure %lld clearing " - "error injection registers\n", - __func__, rc); - return -EIO; - } + /* + * The frozen PE might be caused by PAPR error injection + * registers, which are expected to be cleared after hitting + * frozen PE as stated in the hardware spec. Unfortunately, + * that's not true on P7IOC. So we have to clear it manually + * to avoid recursive EEH errors during recovery. + */ + phb = hose->private_data; + if (phb->model == PNV_PHB_MODEL_P7IOC && + (option == EEH_RESET_HOT || + option == EEH_RESET_FUNDAMENTAL)) { + rc = opal_pci_reset(phb->opal_id, + OPAL_RESET_PHB_ERROR, + OPAL_ASSERT_RESET); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Failure %lld clearing error injection registers\n", + __func__, rc); + return -EIO; } - - bus = eeh_pe_bus_get(pe); - if (pe->type & EEH_PE_VF) - ret = pnv_eeh_reset_vf_pe(pe, option); - else if (pci_is_root_bus(bus) || - pci_is_root_bus(bus->parent)) - ret = pnv_eeh_root_reset(hose, option); - else - ret = pnv_eeh_bridge_reset(bus->self, option); } - return ret; + bus = eeh_pe_bus_get(pe); + if (pe->type & EEH_PE_VF) + return pnv_eeh_reset_vf_pe(pe, option); + + if (pci_is_root_bus(bus) || + pci_is_root_bus(bus->parent)) + return pnv_eeh_root_reset(hose, option); + + return pnv_eeh_bridge_reset(bus->self, option); } /** diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 7229acd9bb3a..0459e100b4e7 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -25,8 +26,6 @@ * Other types of TCE cache invalidation are not functional in the * hardware. */ -#define TCE_KILL_INVAL_ALL PPC_BIT(0) - static struct pci_dev *get_pci_dev(struct device_node *dn) { return PCI_DN(dn)->pcidev; @@ -138,22 +137,17 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe, struct pnv_ioda_pe *pe; struct pci_dn *pdn; - if (npe->flags & PNV_IODA_PE_PEER) { - pe = npe->peers[0]; - pdev = pe->pdev; - } else { - pdev = pnv_pci_get_gpu_dev(npe->pdev); - if (!pdev) - return NULL; + pdev = pnv_pci_get_gpu_dev(npe->pdev); + if (!pdev) + return NULL; - pdn = pci_get_pdn(pdev); - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return NULL; + pdn = pci_get_pdn(pdev); + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) + return NULL; - hose = pci_bus_to_host(pdev->bus); - phb = hose->private_data; - pe = &phb->ioda.pe_array[pdn->pe_number]; - } + hose = pci_bus_to_host(pdev->bus); + phb = hose->private_data; + pe = &phb->ioda.pe_array[pdn->pe_number]; if (gpdev) *gpdev = pdev; @@ -161,92 +155,70 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe, return pe; } -void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe) +long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, + struct iommu_table *tbl) { struct pnv_phb *phb = npe->phb; + int64_t rc; + const unsigned long size = tbl->it_indirect_levels ? + tbl->it_level_size : tbl->it_size; + const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; + const __u64 win_size = tbl->it_size << tbl->it_page_shift; - if (WARN_ON(phb->type != PNV_PHB_NPU || - !phb->ioda.tce_inval_reg || - !(npe->flags & PNV_IODA_PE_DEV))) - return; + pe_info(npe, "Setting up window %llx..%llx pg=%lx\n", + start_addr, start_addr + win_size - 1, + IOMMU_PAGE_SIZE(tbl)); - mb(); /* Ensure previous TCE table stores are visible */ - __raw_writeq(cpu_to_be64(TCE_KILL_INVAL_ALL), - phb->ioda.tce_inval_reg); -} - -void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe, - struct iommu_table *tbl, - unsigned long index, - unsigned long npages, - bool rm) -{ - struct pnv_phb *phb = npe->phb; - - /* We can only invalidate the whole cache on NPU */ - unsigned long val = TCE_KILL_INVAL_ALL; - - if (WARN_ON(phb->type != PNV_PHB_NPU || - !phb->ioda.tce_inval_reg || - !(npe->flags & PNV_IODA_PE_DEV))) - return; - - mb(); /* Ensure previous TCE table stores are visible */ - if (rm) - __raw_rm_writeq(cpu_to_be64(val), - (__be64 __iomem *) phb->ioda.tce_inval_reg_phys); - else - __raw_writeq(cpu_to_be64(val), - phb->ioda.tce_inval_reg); -} - -void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe) -{ - struct pnv_ioda_pe *gpe; - struct pci_dev *gpdev; - int i, avail = -1; - - if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV)) - return; - - gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); - if (!gpe) - return; - - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { - /* Nothing to do if the PE is already connected. */ - if (gpe->peers[i] == npe) - return; - - if (!gpe->peers[i]) - avail = i; + rc = opal_pci_map_pe_dma_window(phb->opal_id, + npe->pe_number, + npe->pe_number, + tbl->it_indirect_levels + 1, + __pa(tbl->it_base), + size << 3, + IOMMU_PAGE_SIZE(tbl)); + if (rc) { + pe_err(npe, "Failed to configure TCE table, err %lld\n", rc); + return rc; } + pnv_pci_ioda2_tce_invalidate_entire(phb, false); - if (WARN_ON(avail < 0)) - return; + /* Add the table to the list so its TCE cache will get invalidated */ + pnv_pci_link_table_and_group(phb->hose->node, num, + tbl, &npe->table_group); - gpe->peers[avail] = npe; - gpe->flags |= PNV_IODA_PE_PEER; + return 0; +} - /* - * We assume that the NPU devices only have a single peer PE - * (the GPU PCIe device PE). - */ - npe->peers[0] = gpe; - npe->flags |= PNV_IODA_PE_PEER; +long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num) +{ + struct pnv_phb *phb = npe->phb; + int64_t rc; + + pe_info(npe, "Removing DMA window\n"); + + rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number, + npe->pe_number, + 0/* levels */, 0/* table address */, + 0/* table size */, 0/* page size */); + if (rc) { + pe_err(npe, "Unmapping failed, ret = %lld\n", rc); + return rc; + } + pnv_pci_ioda2_tce_invalidate_entire(phb, false); + + pnv_pci_unlink_table_and_group(npe->table_group.tables[num], + &npe->table_group); + + return 0; } /* - * For the NPU we want to point the TCE table at the same table as the - * real PCI device. + * Enables 32 bit DMA on NPU. */ -static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe) +static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) { - struct pnv_phb *phb = npe->phb; struct pci_dev *gpdev; struct pnv_ioda_pe *gpe; - void *addr; - unsigned int size; int64_t rc; /* @@ -260,14 +232,7 @@ static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe) if (!gpe) return; - addr = (void *)gpe->table_group.tables[0]->it_base; - size = gpe->table_group.tables[0]->it_size << 3; - rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number, - npe->pe_number, 1, __pa(addr), - size, 0x1000); - if (rc != OPAL_SUCCESS) - pr_warn("%s: Error %lld setting DMA window on PHB#%d-PE#%d\n", - __func__, rc, phb->hose->global_number, npe->pe_number); + rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]); /* * We don't initialise npu_pe->tce32_table as we always use @@ -277,72 +242,120 @@ static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe) } /* - * Enable/disable bypass mode on the NPU. The NPU only supports one + * Enables bypass mode on the NPU. The NPU only supports one * window per link, so bypass needs to be explicitly enabled or * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be * active at the same time. */ -int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enable) +static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) { struct pnv_phb *phb = npe->phb; int64_t rc = 0; + phys_addr_t top = memblock_end_of_DRAM(); if (phb->type != PNV_PHB_NPU || !npe->pdev) return -EINVAL; - if (enable) { - /* Enable the bypass window */ - phys_addr_t top = memblock_end_of_DRAM(); + rc = pnv_npu_unset_window(npe, 0); + if (rc != OPAL_SUCCESS) + return rc; - npe->tce_bypass_base = 0; - top = roundup_pow_of_two(top); - dev_info(&npe->pdev->dev, "Enabling bypass for PE %d\n", - npe->pe_number); - rc = opal_pci_map_pe_dma_window_real(phb->opal_id, - npe->pe_number, npe->pe_number, - npe->tce_bypass_base, top); - } else { - /* - * Disable the bypass window by replacing it with the - * TCE32 window. - */ - pnv_npu_disable_bypass(npe); - } + /* Enable the bypass window */ + + top = roundup_pow_of_two(top); + dev_info(&npe->pdev->dev, "Enabling bypass for PE %d\n", + npe->pe_number); + rc = opal_pci_map_pe_dma_window_real(phb->opal_id, + npe->pe_number, npe->pe_number, + 0 /* bypass base */, top); + + if (rc == OPAL_SUCCESS) + pnv_pci_ioda2_tce_invalidate_entire(phb, false); return rc; } -int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask) +void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass) { - struct pci_controller *hose = pci_bus_to_host(npdev->bus); - struct pnv_phb *phb = hose->private_data; - struct pci_dn *pdn = pci_get_pdn(npdev); - struct pnv_ioda_pe *npe, *gpe; - struct pci_dev *gpdev; - uint64_t top; - bool bypass = false; + int i; + struct pnv_phb *phb; + struct pci_dn *pdn; + struct pnv_ioda_pe *npe; + struct pci_dev *npdev; - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return -ENXIO; + for (i = 0; ; ++i) { + npdev = pnv_pci_get_npu_dev(gpdev, i); - /* We only do bypass if it's enabled on the linked device */ - npe = &phb->ioda.pe_array[pdn->pe_number]; - gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); - if (!gpe) - return -ENODEV; + if (!npdev) + break; - if (gpe->tce_bypass_enabled) { - top = gpe->tce_bypass_base + memblock_end_of_DRAM() - 1; - bypass = (dma_mask >= top); + pdn = pci_get_pdn(npdev); + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) + return; + + phb = pci_bus_to_host(npdev->bus)->private_data; + + /* We only do bypass if it's enabled on the linked device */ + npe = &phb->ioda.pe_array[pdn->pe_number]; + + if (bypass) { + dev_info(&npdev->dev, + "Using 64-bit DMA iommu bypass\n"); + pnv_npu_dma_set_bypass(npe); + } else { + dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n"); + pnv_npu_dma_set_32(npe); + } + } +} + +/* Switch ownership from platform code to external user (e.g. VFIO) */ +void pnv_npu_take_ownership(struct pnv_ioda_pe *npe) +{ + struct pnv_phb *phb = npe->phb; + int64_t rc; + + /* + * Note: NPU has just a single TVE in the hardware which means that + * while used by the kernel, it can have either 32bit window or + * DMA bypass but never both. So we deconfigure 32bit window only + * if it was enabled at the moment of ownership change. + */ + if (npe->table_group.tables[0]) { + pnv_npu_unset_window(npe, 0); + return; } - if (bypass) - dev_info(&npdev->dev, "Using 64-bit DMA iommu bypass\n"); - else - dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n"); - - pnv_npu_dma_set_bypass(npe, bypass); - *npdev->dev.dma_mask = dma_mask; - - return 0; + /* Disable bypass */ + rc = opal_pci_map_pe_dma_window_real(phb->opal_id, + npe->pe_number, npe->pe_number, + 0 /* bypass base */, 0); + if (rc) { + pe_err(npe, "Failed to disable bypass, err %lld\n", rc); + return; + } + pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false); +} + +struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) +{ + struct pnv_phb *phb = npe->phb; + struct pci_bus *pbus = phb->hose->bus; + struct pci_dev *npdev, *gpdev = NULL, *gptmp; + struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); + + if (!gpe || !gpdev) + return NULL; + + list_for_each_entry(npdev, &pbus->devices, bus_list) { + gptmp = pnv_pci_get_gpu_dev(npdev); + + if (gptmp != gpdev) + continue; + + pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev)); + iommu_group_add_device(gpe->table_group.group, &npdev->dev); + } + + return gpe; } diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c index d000f4e21981..c0a8201cb4d9 100644 --- a/arch/powerpc/platforms/powernv/opal-hmi.c +++ b/arch/powerpc/platforms/powernv/opal-hmi.c @@ -150,15 +150,17 @@ static void print_nx_checkstop_reason(const char *level, static void print_checkstop_reason(const char *level, struct OpalHMIEvent *hmi_evt) { - switch (hmi_evt->u.xstop_error.xstop_type) { + uint8_t type = hmi_evt->u.xstop_error.xstop_type; + switch (type) { case CHECKSTOP_TYPE_CORE: print_core_checkstop_reason(level, hmi_evt); break; case CHECKSTOP_TYPE_NX: print_nx_checkstop_reason(level, hmi_evt); break; - case CHECKSTOP_TYPE_UNKNOWN: - printk("%s Unknown Malfunction Alert.\n", level); + default: + printk("%s Unknown Malfunction Alert of type %d\n", + level, type); break; } } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index c5baaf3cc4e5..3a5ea8236db8 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -48,15 +48,16 @@ #include "powernv.h" #include "pci.h" -/* 256M DMA window, 4K TCE pages, 8 bytes TCE */ -#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) +#define PNV_IODA1_M64_NUM 16 /* Number of M64 BARs */ +#define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */ +#define PNV_IODA1_DMA32_SEGSIZE 0x10000000 #define POWERNV_IOMMU_DEFAULT_LEVELS 1 #define POWERNV_IOMMU_MAX_LEVELS 5 static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); -static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, +void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -87,13 +88,6 @@ static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, va_end(args); } -#define pe_err(pe, fmt, ...) \ - pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__) -#define pe_warn(pe, fmt, ...) \ - pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__) -#define pe_info(pe, fmt, ...) \ - pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__) - static bool pnv_iommu_bypass_disabled __read_mostly; static int __init iommu_setup(char *str) @@ -122,9 +116,17 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags) (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)); } +static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no) +{ + phb->ioda.pe_array[pe_no].phb = phb; + phb->ioda.pe_array[pe_no].pe_number = pe_no; + + return &phb->ioda.pe_array[pe_no]; +} + static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) { - if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) { + if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) { pr_warn("%s: Invalid PE %d on PHB#%x\n", __func__, pe_no, phb->hose->global_number); return; @@ -134,32 +136,31 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) pr_debug("%s: PE %d was reserved on PHB#%x\n", __func__, pe_no, phb->hose->global_number); - phb->ioda.pe_array[pe_no].phb = phb; - phb->ioda.pe_array[pe_no].pe_number = pe_no; + pnv_ioda_init_pe(phb, pe_no); } -static int pnv_ioda_alloc_pe(struct pnv_phb *phb) +static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb) { unsigned long pe; do { pe = find_next_zero_bit(phb->ioda.pe_alloc, - phb->ioda.total_pe, 0); - if (pe >= phb->ioda.total_pe) - return IODA_INVALID_PE; + phb->ioda.total_pe_num, 0); + if (pe >= phb->ioda.total_pe_num) + return NULL; } while(test_and_set_bit(pe, phb->ioda.pe_alloc)); - phb->ioda.pe_array[pe].phb = phb; - phb->ioda.pe_array[pe].pe_number = pe; - return pe; + return pnv_ioda_init_pe(phb, pe); } -static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe) +static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) { - WARN_ON(phb->ioda.pe_array[pe].pdev); + struct pnv_phb *phb = pe->phb; - memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe)); - clear_bit(pe, phb->ioda.pe_alloc); + WARN_ON(pe->pdev); + + memset(pe, 0, sizeof(struct pnv_ioda_pe)); + clear_bit(pe->pe_number, phb->ioda.pe_alloc); } /* The default M64 BAR is shared by all PEs */ @@ -199,13 +200,13 @@ static int pnv_ioda2_init_m64(struct pnv_phb *phb) * expected to be 0 or last one of PE capabicity. */ r = &phb->hose->mem_resources[1]; - if (phb->ioda.reserved_pe == 0) + if (phb->ioda.reserved_pe_idx == 0) r->start += phb->ioda.m64_segsize; - else if (phb->ioda.reserved_pe == (phb->ioda.total_pe - 1)) + else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) r->end -= phb->ioda.m64_segsize; else pr_warn(" Cannot strip M64 segment for reserved PE#%d\n", - phb->ioda.reserved_pe); + phb->ioda.reserved_pe_idx); return 0; @@ -219,7 +220,7 @@ static int pnv_ioda2_init_m64(struct pnv_phb *phb) return -EIO; } -static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev, +static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, unsigned long *pe_bitmap) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); @@ -246,22 +247,80 @@ static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev, } } -static void pnv_ioda2_reserve_m64_pe(struct pci_bus *bus, - unsigned long *pe_bitmap, - bool all) +static int pnv_ioda1_init_m64(struct pnv_phb *phb) +{ + struct resource *r; + int index; + + /* + * There are 16 M64 BARs, each of which has 8 segments. So + * there are as many M64 segments as the maximum number of + * PEs, which is 128. + */ + for (index = 0; index < PNV_IODA1_M64_NUM; index++) { + unsigned long base, segsz = phb->ioda.m64_segsize; + int64_t rc; + + base = phb->ioda.m64_base + + index * PNV_IODA1_M64_SEGS * segsz; + rc = opal_pci_set_phb_mem_window(phb->opal_id, + OPAL_M64_WINDOW_TYPE, index, base, 0, + PNV_IODA1_M64_SEGS * segsz); + if (rc != OPAL_SUCCESS) { + pr_warn(" Error %lld setting M64 PHB#%d-BAR#%d\n", + rc, phb->hose->global_number, index); + goto fail; + } + + rc = opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, index, + OPAL_ENABLE_M64_SPLIT); + if (rc != OPAL_SUCCESS) { + pr_warn(" Error %lld enabling M64 PHB#%d-BAR#%d\n", + rc, phb->hose->global_number, index); + goto fail; + } + } + + /* + * Exclude the segment used by the reserved PE, which + * is expected to be 0 or last supported PE#. + */ + r = &phb->hose->mem_resources[1]; + if (phb->ioda.reserved_pe_idx == 0) + r->start += phb->ioda.m64_segsize; + else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) + r->end -= phb->ioda.m64_segsize; + else + WARN(1, "Wrong reserved PE#%d on PHB#%d\n", + phb->ioda.reserved_pe_idx, phb->hose->global_number); + + return 0; + +fail: + for ( ; index >= 0; index--) + opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64); + + return -EIO; +} + +static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus, + unsigned long *pe_bitmap, + bool all) { struct pci_dev *pdev; list_for_each_entry(pdev, &bus->devices, bus_list) { - pnv_ioda2_reserve_dev_m64_pe(pdev, pe_bitmap); + pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap); if (all && pdev->subordinate) - pnv_ioda2_reserve_m64_pe(pdev->subordinate, - pe_bitmap, all); + pnv_ioda_reserve_m64_pe(pdev->subordinate, + pe_bitmap, all); } } -static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) +static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) { struct pci_controller *hose = pci_bus_to_host(bus); struct pnv_phb *phb = hose->private_data; @@ -271,28 +330,28 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) /* Root bus shouldn't use M64 */ if (pci_is_root_bus(bus)) - return IODA_INVALID_PE; + return NULL; /* Allocate bitmap */ - size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); + size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long)); pe_alloc = kzalloc(size, GFP_KERNEL); if (!pe_alloc) { pr_warn("%s: Out of memory !\n", __func__); - return IODA_INVALID_PE; + return NULL; } /* Figure out reserved PE numbers by the PE */ - pnv_ioda2_reserve_m64_pe(bus, pe_alloc, all); + pnv_ioda_reserve_m64_pe(bus, pe_alloc, all); /* * the current bus might not own M64 window and that's all * contributed by its child buses. For the case, we needn't * pick M64 dependent PE#. */ - if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) { + if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) { kfree(pe_alloc); - return IODA_INVALID_PE; + return NULL; } /* @@ -301,10 +360,11 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) */ master_pe = NULL; i = -1; - while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) < - phb->ioda.total_pe) { + while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) < + phb->ioda.total_pe_num) { pe = &phb->ioda.pe_array[i]; + phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number; if (!master_pe) { pe->flags |= PNV_IODA_PE_MASTER; INIT_LIST_HEAD(&pe->slaves); @@ -314,10 +374,30 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) pe->master = master_pe; list_add_tail(&pe->list, &master_pe->slaves); } + + /* + * P7IOC supports M64DT, which helps mapping M64 segment + * to one particular PE#. However, PHB3 has fixed mapping + * between M64 segment and PE#. In order to have same logic + * for P7IOC and PHB3, we enforce fixed mapping between M64 + * segment and PE# on P7IOC. + */ + if (phb->type == PNV_PHB_IODA1) { + int64_t rc; + + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe->pe_number, OPAL_M64_WINDOW_TYPE, + pe->pe_number / PNV_IODA1_M64_SEGS, + pe->pe_number % PNV_IODA1_M64_SEGS); + if (rc != OPAL_SUCCESS) + pr_warn("%s: Error %lld mapping M64 for PHB#%d-PE#%d\n", + __func__, rc, phb->hose->global_number, + pe->pe_number); + } } kfree(pe_alloc); - return master_pe->pe_number; + return master_pe; } static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) @@ -328,8 +408,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) const u32 *r; u64 pci_addr; - /* FIXME: Support M64 for P7IOC */ - if (phb->type != PNV_PHB_IODA2) { + if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) { pr_info(" Not support M64 window\n"); return; } @@ -355,7 +434,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) hose->mem_offset[1] = res->start - pci_addr; phb->ioda.m64_size = resource_size(res); - phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe; + phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num; phb->ioda.m64_base = pci_addr; pr_info(" MEM64 0x%016llx..0x%016llx -> 0x%016llx\n", @@ -363,9 +442,12 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) /* Use last M64 BAR to cover M64 window */ phb->ioda.m64_bar_idx = 15; - phb->init_m64 = pnv_ioda2_init_m64; - phb->reserve_m64_pe = pnv_ioda2_reserve_m64_pe; - phb->pick_m64_pe = pnv_ioda2_pick_m64_pe; + if (phb->type == PNV_PHB_IODA1) + phb->init_m64 = pnv_ioda1_init_m64; + else + phb->init_m64 = pnv_ioda2_init_m64; + phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe; + phb->pick_m64_pe = pnv_ioda_pick_m64_pe; } static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no) @@ -456,7 +538,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) s64 rc; /* Sanity check on PE number */ - if (pe_no < 0 || pe_no >= phb->ioda.total_pe) + if (pe_no < 0 || pe_no >= phb->ioda.total_pe_num) return OPAL_EEH_STOPPED_PERM_UNAVAIL; /* @@ -808,44 +890,6 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) return 0; } -static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb, - struct pnv_ioda_pe *pe) -{ - struct pnv_ioda_pe *lpe; - - list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) { - if (lpe->dma_weight < pe->dma_weight) { - list_add_tail(&pe->dma_link, &lpe->dma_link); - return; - } - } - list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list); -} - -static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) -{ - /* This is quite simplistic. The "base" weight of a device - * is 10. 0 means no DMA is to be accounted for it. - */ - - /* If it's a bridge, no DMA */ - if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) - return 0; - - /* Reduce the weight of slow USB controllers */ - if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || - dev->class == PCI_CLASS_SERIAL_USB_OHCI || - dev->class == PCI_CLASS_SERIAL_USB_EHCI) - return 3; - - /* Increase the weight of RAID (includes Obsidian) */ - if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) - return 15; - - /* Default */ - return 10; -} - #ifdef CONFIG_PCI_IOV static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) { @@ -919,7 +963,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) struct pnv_phb *phb = hose->private_data; struct pci_dn *pdn = pci_get_pdn(dev); struct pnv_ioda_pe *pe; - int pe_num; if (!pdn) { pr_err("%s: Device tree node not associated properly\n", @@ -929,8 +972,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) if (pdn->pe_number != IODA_INVALID_PE) return NULL; - pe_num = pnv_ioda_alloc_pe(phb); - if (pe_num == IODA_INVALID_PE) { + pe = pnv_ioda_alloc_pe(phb); + if (!pe) { pr_warning("%s: Not enough PE# available, disabling device\n", pci_name(dev)); return NULL; @@ -943,14 +986,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) * * At some point we want to remove the PDN completely anyways */ - pe = &phb->ioda.pe_array[pe_num]; pci_dev_get(dev); pdn->pcidev = dev; - pdn->pe_number = pe_num; + pdn->pe_number = pe->pe_number; pe->flags = PNV_IODA_PE_DEV; pe->pdev = dev; pe->pbus = NULL; - pe->tce32_seg = -1; pe->mve_number = -1; pe->rid = dev->bus->number << 8 | pdn->devfn; @@ -958,23 +999,15 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? */ - if (pe_num) - pnv_ioda_free_pe(phb, pe_num); + pnv_ioda_free_pe(pe); pdn->pe_number = IODA_INVALID_PE; pe->pdev = NULL; pci_dev_put(dev); return NULL; } - /* Assign a DMA weight to the device */ - pe->dma_weight = pnv_ioda_dma_weight(dev); - if (pe->dma_weight != 0) { - phb->ioda.dma_weight += pe->dma_weight; - phb->ioda.dma_pe_count++; - } - - /* Link the PE */ - pnv_ioda_link_pe_by_weight(phb, pe); + /* Put PE to the list */ + list_add_tail(&pe->list, &phb->ioda.pe_list); return pe; } @@ -993,7 +1026,6 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) } pdn->pcidev = dev; pdn->pe_number = pe->pe_number; - pe->dma_weight += pnv_ioda_dma_weight(dev); if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) pnv_ioda_setup_same_PE(dev->subordinate, pe); } @@ -1005,49 +1037,44 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) * subordinate PCI devices and buses. The second type of PE is normally * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports. */ -static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) +static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) { struct pci_controller *hose = pci_bus_to_host(bus); struct pnv_phb *phb = hose->private_data; - struct pnv_ioda_pe *pe; - int pe_num = IODA_INVALID_PE; + struct pnv_ioda_pe *pe = NULL; /* Check if PE is determined by M64 */ if (phb->pick_m64_pe) - pe_num = phb->pick_m64_pe(bus, all); + pe = phb->pick_m64_pe(bus, all); /* The PE number isn't pinned by M64 */ - if (pe_num == IODA_INVALID_PE) - pe_num = pnv_ioda_alloc_pe(phb); + if (!pe) + pe = pnv_ioda_alloc_pe(phb); - if (pe_num == IODA_INVALID_PE) { + if (!pe) { pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n", __func__, pci_domain_nr(bus), bus->number); - return; + return NULL; } - pe = &phb->ioda.pe_array[pe_num]; pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS); pe->pbus = bus; pe->pdev = NULL; - pe->tce32_seg = -1; pe->mve_number = -1; pe->rid = bus->busn_res.start << 8; - pe->dma_weight = 0; if (all) pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n", - bus->busn_res.start, bus->busn_res.end, pe_num); + bus->busn_res.start, bus->busn_res.end, pe->pe_number); else pe_info(pe, "Secondary bus %d associated with PE#%d\n", - bus->busn_res.start, pe_num); + bus->busn_res.start, pe->pe_number); if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? */ - if (pe_num) - pnv_ioda_free_pe(phb, pe_num); + pnv_ioda_free_pe(pe); pe->pbus = NULL; - return; + return NULL; } /* Associate it with all child devices */ @@ -1056,16 +1083,7 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) /* Put PE to the list */ list_add_tail(&pe->list, &phb->ioda.pe_list); - /* Account for one DMA PE if at least one DMA capable device exist - * below the bridge - */ - if (pe->dma_weight != 0) { - phb->ioda.dma_weight += pe->dma_weight; - phb->ioda.dma_pe_count++; - } - - /* Link the PE */ - pnv_ioda_link_pe_by_weight(phb, pe); + return pe; } static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) @@ -1088,7 +1106,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) * same GPU get assigned the same PE. */ gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev); - for (pe_num = 0; pe_num < phb->ioda.total_pe; pe_num++) { + for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) { pe = &phb->ioda.pe_array[pe_num]; if (!pe->pdev) continue; @@ -1106,7 +1124,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) rid = npu_pdev->bus->number << 8 | npu_pdn->devfn; npu_pdn->pcidev = npu_pdev; npu_pdn->pe_number = pe_num; - pe->dma_weight += pnv_ioda_dma_weight(npu_pdev); phb->ioda.pe_rmap[rid] = pe->pe_number; /* Map the PE to this link */ @@ -1378,7 +1395,7 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) pnv_ioda_deconfigure_pe(phb, pe); - pnv_ioda_free_pe(phb, pe->pe_number); + pnv_ioda_free_pe(pe); } } @@ -1387,6 +1404,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) struct pci_bus *bus; struct pci_controller *hose; struct pnv_phb *phb; + struct pnv_ioda_pe *pe; struct pci_dn *pdn; struct pci_sriov *iov; u16 num_vfs, i; @@ -1411,8 +1429,11 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) /* Release PE numbers */ if (pdn->m64_single_mode) { for (i = 0; i < num_vfs; i++) { - if (pdn->pe_num_map[i] != IODA_INVALID_PE) - pnv_ioda_free_pe(phb, pdn->pe_num_map[i]); + if (pdn->pe_num_map[i] == IODA_INVALID_PE) + continue; + + pe = &phb->ioda.pe_array[pdn->pe_num_map[i]]; + pnv_ioda_free_pe(pe); } } else bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); @@ -1454,7 +1475,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) pe->flags = PNV_IODA_PE_VF; pe->pbus = NULL; pe->parent_dev = pdev; - pe->tce32_seg = -1; pe->mve_number = -1; pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) | pci_iov_virtfn_devfn(pdev, vf_index); @@ -1466,8 +1486,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? */ - if (pe_num) - pnv_ioda_free_pe(phb, pe_num); + pnv_ioda_free_pe(pe); pe->pdev = NULL; continue; } @@ -1486,6 +1505,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) struct pci_bus *bus; struct pci_controller *hose; struct pnv_phb *phb; + struct pnv_ioda_pe *pe; struct pci_dn *pdn; int ret; u16 i; @@ -1528,18 +1548,20 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) /* Calculate available PE for required VFs */ if (pdn->m64_single_mode) { for (i = 0; i < num_vfs; i++) { - pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb); - if (pdn->pe_num_map[i] == IODA_INVALID_PE) { + pe = pnv_ioda_alloc_pe(phb); + if (!pe) { ret = -EBUSY; goto m64_failed; } + + pdn->pe_num_map[i] = pe->pe_number; } } else { mutex_lock(&phb->ioda.pe_alloc_mutex); *pdn->pe_num_map = bitmap_find_next_zero_area( - phb->ioda.pe_alloc, phb->ioda.total_pe, + phb->ioda.pe_alloc, phb->ioda.total_pe_num, 0, num_vfs, 0); - if (*pdn->pe_num_map >= phb->ioda.total_pe) { + if (*pdn->pe_num_map >= phb->ioda.total_pe_num) { mutex_unlock(&phb->ioda.pe_alloc_mutex); dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs); kfree(pdn->pe_num_map); @@ -1577,8 +1599,11 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) m64_failed: if (pdn->m64_single_mode) { for (i = 0; i < num_vfs; i++) { - if (pdn->pe_num_map[i] != IODA_INVALID_PE) - pnv_ioda_free_pe(phb, pdn->pe_num_map[i]); + if (pdn->pe_num_map[i] == IODA_INVALID_PE) + continue; + + pe = &phb->ioda.pe_array[pdn->pe_num_map[i]]; + pnv_ioda_free_pe(pe); } } else bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); @@ -1640,8 +1665,6 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) struct pnv_ioda_pe *pe; uint64_t top; bool bypass = false; - struct pci_dev *linked_npu_dev; - int i; if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) return -ENODEV;; @@ -1662,15 +1685,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) *pdev->dev.dma_mask = dma_mask; /* Update peer npu devices */ - if (pe->flags & PNV_IODA_PE_PEER) - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { - if (!pe->peers[i]) - continue; - - linked_npu_dev = pe->peers[i]->pdev; - if (dma_get_mask(&linked_npu_dev->dev) != dma_mask) - dma_set_mask(&linked_npu_dev->dev, dma_mask); - } + pnv_npu_try_dma_set_bypass(pdev, bypass); return 0; } @@ -1811,28 +1826,34 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { .get = pnv_tce_get, }; -static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe) +#define TCE_KILL_INVAL_ALL PPC_BIT(0) +#define TCE_KILL_INVAL_PE PPC_BIT(1) +#define TCE_KILL_INVAL_TCE PPC_BIT(2) + +void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm) +{ + const unsigned long val = TCE_KILL_INVAL_ALL; + + mb(); /* Ensure previous TCE table stores are visible */ + if (rm) + __raw_rm_writeq(cpu_to_be64(val), + (__be64 __iomem *) + phb->ioda.tce_inval_reg_phys); + else + __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg); +} + +static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) { /* 01xb - invalidate TCEs that match the specified PE# */ - unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF); + unsigned long val = TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF); struct pnv_phb *phb = pe->phb; - struct pnv_ioda_pe *npe; - int i; if (!phb->ioda.tce_inval_reg) return; mb(); /* Ensure above stores are visible */ __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg); - - if (pe->flags & PNV_IODA_PE_PEER) - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { - npe = pe->peers[i]; - if (!npe || npe->phb->type != PNV_PHB_NPU) - continue; - - pnv_npu_tce_invalidate_entire(npe); - } } static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm, @@ -1842,7 +1863,7 @@ static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm, unsigned long start, end, inc; /* We'll invalidate DMA address in PE scope */ - start = 0x2ull << 60; + start = TCE_KILL_INVAL_TCE; start |= (pe_number & 0xFF); end = start; @@ -1867,28 +1888,24 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, struct iommu_table_group_link *tgl; list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { - struct pnv_ioda_pe *npe; struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); __be64 __iomem *invalidate = rm ? (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys : pe->phb->ioda.tce_inval_reg; - int i; + if (pe->phb->type == PNV_PHB_NPU) { + /* + * The NVLink hardware does not support TCE kill + * per TCE entry so we have to invalidate + * the entire cache for it. + */ + pnv_pci_ioda2_tce_invalidate_entire(pe->phb, rm); + continue; + } pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm, invalidate, tbl->it_page_shift, index, npages); - - if (pe->flags & PNV_IODA_PE_PEER) - /* Invalidate PEs using the same TCE table */ - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { - npe = pe->peers[i]; - if (!npe || npe->phb->type != PNV_PHB_NPU) - continue; - - pnv_npu_tce_invalidate(npe, tbl, index, - npages, rm); - } } } @@ -1945,56 +1962,140 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = { .free = pnv_ioda2_table_free, }; -static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe, unsigned int base, - unsigned int segs) +static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data) +{ + unsigned int *weight = (unsigned int *)data; + + /* This is quite simplistic. The "base" weight of a device + * is 10. 0 means no DMA is to be accounted for it. + */ + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) + return 0; + + if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || + dev->class == PCI_CLASS_SERIAL_USB_OHCI || + dev->class == PCI_CLASS_SERIAL_USB_EHCI) + *weight += 3; + else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) + *weight += 15; + else + *weight += 10; + + return 0; +} + +static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe) +{ + unsigned int weight = 0; + + /* SRIOV VF has same DMA32 weight as its PF */ +#ifdef CONFIG_PCI_IOV + if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) { + pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight); + return weight; + } +#endif + + if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) { + pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight); + } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) { + struct pci_dev *pdev; + + list_for_each_entry(pdev, &pe->pbus->devices, bus_list) + pnv_pci_ioda_dev_dma_weight(pdev, &weight); + } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) { + pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight); + } + + return weight; +} + +static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) { struct page *tce_mem = NULL; struct iommu_table *tbl; - unsigned int i; + unsigned int weight, total_weight = 0; + unsigned int tce32_segsz, base, segs, avail, i; int64_t rc; void *addr; /* XXX FIXME: Handle 64-bit only DMA devices */ /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ /* XXX FIXME: Allocate multi-level tables on PHB3 */ - - /* We shouldn't already have a 32-bit DMA associated */ - if (WARN_ON(pe->tce32_seg >= 0)) + weight = pnv_pci_ioda_pe_dma_weight(pe); + if (!weight) return; + pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight, + &total_weight); + segs = (weight * phb->ioda.dma32_count) / total_weight; + if (!segs) + segs = 1; + + /* + * Allocate contiguous DMA32 segments. We begin with the expected + * number of segments. With one more attempt, the number of DMA32 + * segments to be allocated is decreased by one until one segment + * is allocated successfully. + */ + do { + for (base = 0; base <= phb->ioda.dma32_count - segs; base++) { + for (avail = 0, i = base; i < base + segs; i++) { + if (phb->ioda.dma32_segmap[i] == + IODA_INVALID_PE) + avail++; + } + + if (avail == segs) + goto found; + } + } while (--segs); + + if (!segs) { + pe_warn(pe, "No available DMA32 segments\n"); + return; + } + +found: tbl = pnv_pci_table_alloc(phb->hose->node); iommu_register_group(&pe->table_group, phb->hose->global_number, pe->pe_number); pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); /* Grab a 32-bit TCE table */ - pe->tce32_seg = base; + pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n", + weight, total_weight, base, segs); pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", - (base << 28), ((base + segs) << 28) - 1); + base * PNV_IODA1_DMA32_SEGSIZE, + (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1); /* XXX Currently, we allocate one big contiguous table for the * TCEs. We only really need one chunk per 256M of TCE space * (ie per segment) but that's an optimization for later, it * requires some added smarts with our get/put_tce implementation + * + * Each TCE page is 4KB in size and each TCE entry occupies 8 + * bytes */ + tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3); tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, - get_order(TCE32_TABLE_SIZE * segs)); + get_order(tce32_segsz * segs)); if (!tce_mem) { pe_err(pe, " Failed to allocate a 32-bit TCE memory\n"); goto fail; } addr = page_address(tce_mem); - memset(addr, 0, TCE32_TABLE_SIZE * segs); + memset(addr, 0, tce32_segsz * segs); /* Configure HW */ for (i = 0; i < segs; i++) { rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, base + i, 1, - __pa(addr) + TCE32_TABLE_SIZE * i, - TCE32_TABLE_SIZE, 0x1000); + __pa(addr) + tce32_segsz * i, + tce32_segsz, IOMMU_PAGE_SIZE_4K); if (rc) { pe_err(pe, " Failed to configure 32-bit TCE table," " err %ld\n", rc); @@ -2002,9 +2103,14 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, } } + /* Setup DMA32 segment mapping */ + for (i = base; i < base + segs; i++) + phb->ioda.dma32_segmap[i] = pe->pe_number; + /* Setup linux iommu table */ - pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, - base << 28, IOMMU_PAGE_SHIFT_4K); + pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs, + base * PNV_IODA1_DMA32_SEGSIZE, + IOMMU_PAGE_SHIFT_4K); /* OPAL variant of P7IOC SW invalidated TCEs */ if (phb->ioda.tce_inval_reg) @@ -2031,10 +2137,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, return; fail: /* XXX Failure: Try to fallback to 64-bit only ? */ - if (pe->tce32_seg >= 0) - pe->tce32_seg = -1; if (tce_mem) - __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); + __free_pages(tce_mem, get_order(tce32_segsz * segs)); if (tbl) { pnv_pci_unlink_table_and_group(tbl, &pe->table_group); iommu_free_table(tbl, "pnv"); @@ -2075,7 +2179,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, pnv_pci_link_table_and_group(phb->hose->node, num, tbl, &pe->table_group); - pnv_pci_ioda2_tce_invalidate_entire(pe); + pnv_pci_ioda2_tce_invalidate_pe(pe); return 0; } @@ -2219,7 +2323,7 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, if (ret) pe_warn(pe, "Unmapping failed, ret = %ld\n", ret); else - pnv_pci_ioda2_tce_invalidate_entire(pe); + pnv_pci_ioda2_tce_invalidate_pe(pe); pnv_pci_unlink_table_and_group(table_group->tables[num], table_group); @@ -2288,6 +2392,116 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = { .take_ownership = pnv_ioda2_take_ownership, .release_ownership = pnv_ioda2_release_ownership, }; + +static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + struct pnv_ioda_pe **ptmppe = opaque; + struct pci_dev *pdev = container_of(dev, struct pci_dev, dev); + struct pci_dn *pdn = pci_get_pdn(pdev); + + if (!pdn || pdn->pe_number == IODA_INVALID_PE) + return 0; + + hose = pci_bus_to_host(pdev->bus); + phb = hose->private_data; + if (phb->type != PNV_PHB_NPU) + return 0; + + *ptmppe = &phb->ioda.pe_array[pdn->pe_number]; + + return 1; +} + +/* + * This returns PE of associated NPU. + * This assumes that NPU is in the same IOMMU group with GPU and there is + * no other PEs. + */ +static struct pnv_ioda_pe *gpe_table_group_to_npe( + struct iommu_table_group *table_group) +{ + struct pnv_ioda_pe *npe = NULL; + int ret = iommu_group_for_each_dev(table_group->group, &npe, + gpe_table_group_to_npe_cb); + + BUG_ON(!ret || !npe); + + return npe; +} + +static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group, + int num, struct iommu_table *tbl) +{ + long ret = pnv_pci_ioda2_set_window(table_group, num, tbl); + + if (ret) + return ret; + + ret = pnv_npu_set_window(gpe_table_group_to_npe(table_group), num, tbl); + if (ret) + pnv_pci_ioda2_unset_window(table_group, num); + + return ret; +} + +static long pnv_pci_ioda2_npu_unset_window( + struct iommu_table_group *table_group, + int num) +{ + long ret = pnv_pci_ioda2_unset_window(table_group, num); + + if (ret) + return ret; + + return pnv_npu_unset_window(gpe_table_group_to_npe(table_group), num); +} + +static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group) +{ + /* + * Detach NPU first as pnv_ioda2_take_ownership() will destroy + * the iommu_table if 32bit DMA is enabled. + */ + pnv_npu_take_ownership(gpe_table_group_to_npe(table_group)); + pnv_ioda2_take_ownership(table_group); +} + +static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = { + .get_table_size = pnv_pci_ioda2_get_table_size, + .create_table = pnv_pci_ioda2_create_table, + .set_window = pnv_pci_ioda2_npu_set_window, + .unset_window = pnv_pci_ioda2_npu_unset_window, + .take_ownership = pnv_ioda2_npu_take_ownership, + .release_ownership = pnv_ioda2_release_ownership, +}; + +static void pnv_pci_ioda_setup_iommu_api(void) +{ + struct pci_controller *hose, *tmp; + struct pnv_phb *phb; + struct pnv_ioda_pe *pe, *gpe; + + /* + * Now we have all PHBs discovered, time to add NPU devices to + * the corresponding IOMMU groups. + */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + phb = hose->private_data; + + if (phb->type != PNV_PHB_NPU) + continue; + + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + gpe = pnv_pci_npu_setup_iommu(pe); + if (gpe) + gpe->table_group.ops = &pnv_pci_ioda2_npu_ops; + } + } +} +#else /* !CONFIG_IOMMU_API */ +static void pnv_pci_ioda_setup_iommu_api(void) { }; #endif static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb) @@ -2443,10 +2657,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, { int64_t rc; - /* We shouldn't already have a 32-bit DMA associated */ - if (WARN_ON(pe->tce32_seg >= 0)) - return; - /* TVE #1 is selected by PCI address bit 59 */ pe->tce_bypass_base = 1ull << 59; @@ -2454,7 +2664,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, pe->pe_number); /* The PE will reserve all possible 32-bits space */ - pe->tce32_seg = 0; pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", phb->ioda.m32_pci_base); @@ -2470,11 +2679,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, #endif rc = pnv_pci_ioda2_setup_default_config(pe); - if (rc) { - if (pe->tce32_seg >= 0) - pe->tce32_seg = -1; + if (rc) return; - } if (pe->flags & PNV_IODA_PE_DEV) iommu_add_device(&pe->pdev->dev); @@ -2485,47 +2691,24 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, static void pnv_ioda_setup_dma(struct pnv_phb *phb) { struct pci_controller *hose = phb->hose; - unsigned int residual, remaining, segs, tw, base; struct pnv_ioda_pe *pe; + unsigned int weight; /* If we have more PE# than segments available, hand out one * per PE until we run out and let the rest fail. If not, * then we assign at least one segment per PE, plus more based * on the amount of devices under that PE */ - if (phb->ioda.dma_pe_count > phb->ioda.tce32_count) - residual = 0; - else - residual = phb->ioda.tce32_count - - phb->ioda.dma_pe_count; - - pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n", - hose->global_number, phb->ioda.tce32_count); - pr_info("PCI: %d PE# for a total weight of %d\n", - phb->ioda.dma_pe_count, phb->ioda.dma_weight); + pr_info("PCI: Domain %04x has %d available 32-bit DMA segments\n", + hose->global_number, phb->ioda.dma32_count); pnv_pci_ioda_setup_opal_tce_kill(phb); - /* Walk our PE list and configure their DMA segments, hand them - * out one base segment plus any residual segments based on - * weight - */ - remaining = phb->ioda.tce32_count; - tw = phb->ioda.dma_weight; - base = 0; - list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) { - if (!pe->dma_weight) + /* Walk our PE list and configure their DMA segments */ + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + weight = pnv_pci_ioda_pe_dma_weight(pe); + if (!weight) continue; - if (!remaining) { - pe_warn(pe, "No DMA32 resources available\n"); - continue; - } - segs = 1; - if (residual) { - segs += ((pe->dma_weight * residual) + (tw / 2)) / tw; - if (segs > remaining) - segs = remaining; - } /* * For IODA2 compliant PHB3, we needn't care about the weight. @@ -2533,12 +2716,9 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb) * the specific PE. */ if (phb->type == PNV_PHB_IODA1) { - pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", - pe->dma_weight, segs); - pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); + pnv_pci_ioda1_setup_dma_pe(phb, pe); } else if (phb->type == PNV_PHB_IODA2) { pe_info(pe, "Assign DMA32 space\n"); - segs = 0; pnv_pci_ioda2_setup_dma_pe(phb, pe); } else if (phb->type == PNV_PHB_NPU) { /* @@ -2548,9 +2728,6 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb) * as the PHB3 TVT. */ } - - remaining -= segs; - base += segs; } } @@ -2858,7 +3035,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) pdn->m64_single_mode = false; total_vfs = pci_sriov_get_totalvfs(pdev); - mul = phb->ioda.total_pe; + mul = phb->ioda.total_pe_num; total_vf_bar_sz = 0; for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { @@ -2929,19 +3106,72 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) } #endif /* CONFIG_PCI_IOV */ +static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, + struct resource *res) +{ + struct pnv_phb *phb = pe->phb; + struct pci_bus_region region; + int index; + int64_t rc; + + if (!res || !res->flags || res->start > res->end) + return; + + if (res->flags & IORESOURCE_IO) { + region.start = res->start - phb->ioda.io_pci_base; + region.end = res->end - phb->ioda.io_pci_base; + index = region.start / phb->ioda.io_segsize; + + while (index < phb->ioda.total_pe_num && + region.start <= region.end) { + phb->ioda.io_segmap[index] = pe->pe_number; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index); + if (rc != OPAL_SUCCESS) { + pr_err("%s: Error %lld mapping IO segment#%d to PE#%d\n", + __func__, rc, index, pe->pe_number); + break; + } + + region.start += phb->ioda.io_segsize; + index++; + } + } else if ((res->flags & IORESOURCE_MEM) && + !pnv_pci_is_mem_pref_64(res->flags)) { + region.start = res->start - + phb->hose->mem_offset[0] - + phb->ioda.m32_pci_base; + region.end = res->end - + phb->hose->mem_offset[0] - + phb->ioda.m32_pci_base; + index = region.start / phb->ioda.m32_segsize; + + while (index < phb->ioda.total_pe_num && + region.start <= region.end) { + phb->ioda.m32_segmap[index] = pe->pe_number; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index); + if (rc != OPAL_SUCCESS) { + pr_err("%s: Error %lld mapping M32 segment#%d to PE#%d", + __func__, rc, index, pe->pe_number); + break; + } + + region.start += phb->ioda.m32_segsize; + index++; + } + } +} + /* * This function is supposed to be called on basis of PE from top * to bottom style. So the the I/O or MMIO segment assigned to * parent PE could be overrided by its child PEs if necessary. */ -static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, - struct pnv_ioda_pe *pe) +static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe) { - struct pnv_phb *phb = hose->private_data; - struct pci_bus_region region; - struct resource *res; - int i, index; - int rc; + struct pci_dev *pdev; + int i; /* * NOTE: We only care PCI bus based PE for now. For PCI @@ -2950,57 +3180,20 @@ static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, */ BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))); - pci_bus_for_each_resource(pe->pbus, res, i) { - if (!res || !res->flags || - res->start > res->end) + list_for_each_entry(pdev, &pe->pbus->devices, bus_list) { + for (i = 0; i <= PCI_ROM_RESOURCE; i++) + pnv_ioda_setup_pe_res(pe, &pdev->resource[i]); + + /* + * If the PE contains all subordinate PCI buses, the + * windows of the child bridges should be mapped to + * the PE as well. + */ + if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev)) continue; - - if (res->flags & IORESOURCE_IO) { - region.start = res->start - phb->ioda.io_pci_base; - region.end = res->end - phb->ioda.io_pci_base; - index = region.start / phb->ioda.io_segsize; - - while (index < phb->ioda.total_pe && - region.start <= region.end) { - phb->ioda.io_segmap[index] = pe->pe_number; - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index); - if (rc != OPAL_SUCCESS) { - pr_err("%s: OPAL error %d when mapping IO " - "segment #%d to PE#%d\n", - __func__, rc, index, pe->pe_number); - break; - } - - region.start += phb->ioda.io_segsize; - index++; - } - } else if ((res->flags & IORESOURCE_MEM) && - !pnv_pci_is_mem_pref_64(res->flags)) { - region.start = res->start - - hose->mem_offset[0] - - phb->ioda.m32_pci_base; - region.end = res->end - - hose->mem_offset[0] - - phb->ioda.m32_pci_base; - index = region.start / phb->ioda.m32_segsize; - - while (index < phb->ioda.total_pe && - region.start <= region.end) { - phb->ioda.m32_segmap[index] = pe->pe_number; - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index); - if (rc != OPAL_SUCCESS) { - pr_err("%s: OPAL error %d when mapping M32 " - "segment#%d to PE#%d", - __func__, rc, index, pe->pe_number); - break; - } - - region.start += phb->ioda.m32_segsize; - index++; - } - } + for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) + pnv_ioda_setup_pe_res(pe, + &pdev->resource[PCI_BRIDGE_RESOURCES + i]); } } @@ -3018,7 +3211,7 @@ static void pnv_pci_ioda_setup_seg(void) continue; list_for_each_entry(pe, &phb->ioda.pe_list, list) { - pnv_ioda_setup_pe_seg(hose, pe); + pnv_ioda_setup_pe_seg(pe); } } } @@ -3035,6 +3228,8 @@ static void pnv_pci_ioda_setup_DMA(void) phb = hose->private_data; phb->initialized = 1; } + + pnv_pci_ioda_setup_iommu_api(); } static void pnv_pci_ioda_create_dbgfs(void) @@ -3056,27 +3251,6 @@ static void pnv_pci_ioda_create_dbgfs(void) #endif /* CONFIG_DEBUG_FS */ } -static void pnv_npu_ioda_fixup(void) -{ - bool enable_bypass; - struct pci_controller *hose, *tmp; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - phb = hose->private_data; - if (phb->type != PNV_PHB_NPU) - continue; - - list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) { - enable_bypass = dma_get_mask(&pe->pdev->dev) == - DMA_BIT_MASK(64); - pnv_npu_init_dma_pe(pe); - pnv_npu_dma_set_bypass(pe, enable_bypass); - } - } -} - static void pnv_pci_ioda_fixup(void) { pnv_pci_ioda_setup_PEs(); @@ -3089,9 +3263,6 @@ static void pnv_pci_ioda_fixup(void) eeh_init(); eeh_addr_cache_build(); #endif - - /* Link NPU IODA tables to their PCI devices. */ - pnv_npu_ioda_fixup(); } /* @@ -3195,12 +3366,6 @@ static bool pnv_pci_enable_device_hook(struct pci_dev *dev) return true; } -static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus, - u32 devfn) -{ - return phb->ioda.pe_rmap[(bus->number << 8) | devfn]; -} - static void pnv_pci_ioda_shutdown(struct pci_controller *hose) { struct pnv_phb *phb = hose->private_data; @@ -3210,31 +3375,39 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) } static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, - .dma_bus_setup = pnv_pci_dma_bus_setup, + .dma_dev_setup = pnv_pci_dma_dev_setup, + .dma_bus_setup = pnv_pci_dma_bus_setup, #ifdef CONFIG_PCI_MSI - .setup_msi_irqs = pnv_setup_msi_irqs, - .teardown_msi_irqs = pnv_teardown_msi_irqs, + .setup_msi_irqs = pnv_setup_msi_irqs, + .teardown_msi_irqs = pnv_teardown_msi_irqs, #endif - .enable_device_hook = pnv_pci_enable_device_hook, - .window_alignment = pnv_pci_window_alignment, - .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_pci_ioda_dma_set_mask, - .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask, - .shutdown = pnv_pci_ioda_shutdown, + .enable_device_hook = pnv_pci_enable_device_hook, + .window_alignment = pnv_pci_window_alignment, + .reset_secondary_bus = pnv_pci_reset_secondary_bus, + .dma_set_mask = pnv_pci_ioda_dma_set_mask, + .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask, + .shutdown = pnv_pci_ioda_shutdown, }; +static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask) +{ + dev_err_once(&npdev->dev, + "%s operation unsupported for NVLink devices\n", + __func__); + return -EPERM; +} + static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, + .dma_dev_setup = pnv_pci_dma_dev_setup, #ifdef CONFIG_PCI_MSI - .setup_msi_irqs = pnv_setup_msi_irqs, - .teardown_msi_irqs = pnv_teardown_msi_irqs, + .setup_msi_irqs = pnv_setup_msi_irqs, + .teardown_msi_irqs = pnv_teardown_msi_irqs, #endif - .enable_device_hook = pnv_pci_enable_device_hook, - .window_alignment = pnv_pci_window_alignment, - .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_npu_dma_set_mask, - .shutdown = pnv_pci_ioda_shutdown, + .enable_device_hook = pnv_pci_enable_device_hook, + .window_alignment = pnv_pci_window_alignment, + .reset_secondary_bus = pnv_pci_reset_secondary_bus, + .dma_set_mask = pnv_npu_dma_set_mask, + .shutdown = pnv_pci_ioda_shutdown, }; static void __init pnv_pci_init_ioda_phb(struct device_node *np, @@ -3242,10 +3415,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, { struct pci_controller *hose; struct pnv_phb *phb; - unsigned long size, m32map_off, pemap_off, iomap_off = 0; + unsigned long size, m64map_off, m32map_off, pemap_off; + unsigned long iomap_off = 0, dma32map_off = 0; const __be64 *prop64; const __be32 *prop32; int len; + unsigned int segno; u64 phb_id; void *aux; long rc; @@ -3306,13 +3481,13 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, pr_err(" Failed to map registers !\n"); /* Initialize more IODA stuff */ - phb->ioda.total_pe = 1; + phb->ioda.total_pe_num = 1; prop32 = of_get_property(np, "ibm,opal-num-pes", NULL); if (prop32) - phb->ioda.total_pe = be32_to_cpup(prop32); + phb->ioda.total_pe_num = be32_to_cpup(prop32); prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL); if (prop32) - phb->ioda.reserved_pe = be32_to_cpup(prop32); + phb->ioda.reserved_pe_idx = be32_to_cpup(prop32); /* Parse 64-bit MMIO range */ pnv_ioda_parse_m64_window(phb); @@ -3321,36 +3496,58 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, /* FW Has already off top 64k of M32 space (MSI space) */ phb->ioda.m32_size += 0x10000; - phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe; + phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num; phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0]; phb->ioda.io_size = hose->pci_io_size; - phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe; + phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num; phb->ioda.io_pci_base = 0; /* XXX calculate this ? */ + /* Calculate how many 32-bit TCE segments we have */ + phb->ioda.dma32_count = phb->ioda.m32_pci_base / + PNV_IODA1_DMA32_SEGSIZE; + /* Allocate aux data & arrays. We don't have IO ports on PHB3 */ - size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); + size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8, + sizeof(unsigned long)); + m64map_off = size; + size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]); m32map_off = size; - size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]); + size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]); if (phb->type == PNV_PHB_IODA1) { iomap_off = size; - size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]); + size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]); + dma32map_off = size; + size += phb->ioda.dma32_count * + sizeof(phb->ioda.dma32_segmap[0]); } pemap_off = size; - size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe); + size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe); aux = memblock_virt_alloc(size, 0); phb->ioda.pe_alloc = aux; + phb->ioda.m64_segmap = aux + m64map_off; phb->ioda.m32_segmap = aux + m32map_off; - if (phb->type == PNV_PHB_IODA1) + for (segno = 0; segno < phb->ioda.total_pe_num; segno++) { + phb->ioda.m64_segmap[segno] = IODA_INVALID_PE; + phb->ioda.m32_segmap[segno] = IODA_INVALID_PE; + } + if (phb->type == PNV_PHB_IODA1) { phb->ioda.io_segmap = aux + iomap_off; - phb->ioda.pe_array = aux + pemap_off; - set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc); + for (segno = 0; segno < phb->ioda.total_pe_num; segno++) + phb->ioda.io_segmap[segno] = IODA_INVALID_PE; + + phb->ioda.dma32_segmap = aux + dma32map_off; + for (segno = 0; segno < phb->ioda.dma32_count; segno++) + phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE; + } + phb->ioda.pe_array = aux + pemap_off; + set_bit(phb->ioda.reserved_pe_idx, phb->ioda.pe_alloc); - INIT_LIST_HEAD(&phb->ioda.pe_dma_list); INIT_LIST_HEAD(&phb->ioda.pe_list); mutex_init(&phb->ioda.pe_list_mutex); /* Calculate how many 32-bit TCE segments we have */ - phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; + phb->ioda.dma32_count = phb->ioda.m32_pci_base / + PNV_IODA1_DMA32_SEGSIZE; #if 0 /* We should really do that ... */ rc = opal_pci_set_phb_mem_window(opal->phb_id, @@ -3362,7 +3559,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, #endif pr_info(" %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n", - phb->ioda.total_pe, phb->ioda.reserved_pe, + phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx, phb->ioda.m32_size, phb->ioda.m32_segsize); if (phb->ioda.m64_size) pr_info(" M64: 0x%lx [segment=0x%lx]\n", @@ -3377,12 +3574,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->freeze_pe = pnv_ioda_freeze_pe; phb->unfreeze_pe = pnv_ioda_unfreeze_pe; - /* Setup RID -> PE mapping function */ - phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe; - - /* Setup TCEs */ - phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; - /* Setup MSI support */ pnv_pci_init_ioda_msis(phb); @@ -3395,10 +3586,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, */ ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; - if (phb->type == PNV_PHB_NPU) + if (phb->type == PNV_PHB_NPU) { hose->controller_ops = pnv_npu_ioda_controller_ops; - else + } else { + phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; hose->controller_ops = pnv_pci_ioda_controller_ops; + } #ifdef CONFIG_PCI_IOV ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources; diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 73c8dc2a353f..1d92bd93bcd9 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -39,9 +39,6 @@ /* Delay in usec */ #define PCI_RESET_DELAY_US 3000000 -#define cfg_dbg(fmt...) do { } while(0) -//#define cfg_dbg(fmt...) printk(fmt) - #ifdef CONFIG_PCI_MSI int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) { @@ -370,7 +367,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn) struct pnv_phb *phb = pdn->phb->private_data; u8 fstate; __be16 pcierr; - int pe_no; + unsigned int pe_no; s64 rc; /* @@ -380,7 +377,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn) */ pe_no = pdn->pe_number; if (pe_no == IODA_INVALID_PE) { - pe_no = phb->ioda.reserved_pe; + pe_no = phb->ioda.reserved_pe_idx; } /* @@ -402,8 +399,8 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn) } } - cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n", - (pdn->busno << 8) | (pdn->devfn), pe_no, fstate); + pr_devel(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n", + (pdn->busno << 8) | (pdn->devfn), pe_no, fstate); /* Clear the frozen state if applicable */ if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE || @@ -451,8 +448,8 @@ int pnv_pci_cfg_read(struct pci_dn *pdn, return PCIBIOS_FUNC_NOT_SUPPORTED; } - cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n", - __func__, pdn->busno, pdn->devfn, where, size, *val); + pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n", + __func__, pdn->busno, pdn->devfn, where, size, *val); return PCIBIOS_SUCCESSFUL; } @@ -462,8 +459,8 @@ int pnv_pci_cfg_write(struct pci_dn *pdn, struct pnv_phb *phb = pdn->phb->private_data; u32 bdfn = (pdn->busno << 8) | pdn->devfn; - cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n", - pdn->busno, pdn->devfn, where, size, val); + pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n", + __func__, pdn->busno, pdn->devfn, where, size, val); switch (size) { case 1: opal_pci_config_write_byte(phb->opal_id, bdfn, where, val); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 3f814f382b2e..7dee25e304db 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -24,7 +24,6 @@ enum pnv_phb_model { #define PNV_IODA_PE_MASTER (1 << 3) /* Master PE in compound case */ #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */ #define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */ -#define PNV_IODA_PE_PEER (1 << 6) /* PE has peers */ /* Data associated with a PE, including IOMMU tracking etc.. */ struct pnv_phb; @@ -32,9 +31,6 @@ struct pnv_ioda_pe { unsigned long flags; struct pnv_phb *phb; -#define PNV_IODA_MAX_PEER_PES 8 - struct pnv_ioda_pe *peers[PNV_IODA_MAX_PEER_PES]; - /* A PE can be associated with a single device or an * entire bus (& children). In the former case, pdev * is populated, in the later case, pbus is. @@ -53,14 +49,7 @@ struct pnv_ioda_pe { /* PE number */ unsigned int pe_number; - /* "Weight" assigned to the PE for the sake of DMA resource - * allocations - */ - unsigned int dma_weight; - /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ - int tce32_seg; - int tce32_segcount; struct iommu_table_group table_group; /* 64-bit TCE bypass region */ @@ -78,7 +67,6 @@ struct pnv_ioda_pe { struct list_head slaves; /* Link in list of PE#s */ - struct list_head dma_link; struct list_head list; }; @@ -110,19 +98,18 @@ struct pnv_phb { unsigned int is_64, struct msi_msg *msg); void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); void (*fixup_phb)(struct pci_controller *hose); - u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn); int (*init_m64)(struct pnv_phb *phb); void (*reserve_m64_pe)(struct pci_bus *bus, unsigned long *pe_bitmap, bool all); - int (*pick_m64_pe)(struct pci_bus *bus, bool all); + struct pnv_ioda_pe *(*pick_m64_pe)(struct pci_bus *bus, bool all); int (*get_pe_state)(struct pnv_phb *phb, int pe_no); void (*freeze_pe)(struct pnv_phb *phb, int pe_no); int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt); struct { /* Global bridge info */ - unsigned int total_pe; - unsigned int reserved_pe; + unsigned int total_pe_num; + unsigned int reserved_pe_idx; /* 32-bit MMIO window */ unsigned int m32_size; @@ -141,15 +128,19 @@ struct pnv_phb { unsigned int io_segsize; unsigned int io_pci_base; - /* PE allocation bitmap */ - unsigned long *pe_alloc; - /* PE allocation mutex */ + /* PE allocation */ struct mutex pe_alloc_mutex; + unsigned long *pe_alloc; + struct pnv_ioda_pe *pe_array; /* M32 & IO segment maps */ + unsigned int *m64_segmap; unsigned int *m32_segmap; unsigned int *io_segmap; - struct pnv_ioda_pe *pe_array; + + /* DMA32 segment maps - IODA1 only */ + unsigned int dma32_count; + unsigned int *dma32_segmap; /* IRQ chip */ int irq_chip_init; @@ -167,20 +158,6 @@ struct pnv_phb { */ unsigned char pe_rmap[0x10000]; - /* 32-bit TCE tables allocation */ - unsigned long tce32_count; - - /* Total "weight" for the sake of DMA resources - * allocation - */ - unsigned int dma_weight; - unsigned int dma_pe_count; - - /* Sorted list of used PE's, sorted at - * boot for resource allocation purposes - */ - struct list_head pe_dma_list; - /* TCE cache invalidate registers (physical and * remapped) */ @@ -236,16 +213,23 @@ extern void pnv_pci_dma_bus_setup(struct pci_bus *bus); extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); +extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, + const char *fmt, ...); +#define pe_err(pe, fmt, ...) \ + pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__) +#define pe_warn(pe, fmt, ...) \ + pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__) +#define pe_info(pe, fmt, ...) \ + pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__) + /* Nvlink functions */ -extern void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe); -extern void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe, - struct iommu_table *tbl, - unsigned long index, - unsigned long npages, - bool rm); -extern void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe); -extern void pnv_npu_setup_dma_pe(struct pnv_ioda_pe *npe); -extern int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enabled); -extern int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask); +extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); +extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); +extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); +extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, + struct iommu_table *tbl); +extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num); +extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe); +extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe); #endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 1acb0c72d923..ee6430bedcc3 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -273,7 +273,10 @@ static int __init pnv_probe(void) if (!of_flat_dt_is_compatible(root, "ibm,powernv")) return 0; - hpte_init_native(); + if (IS_ENABLED(CONFIG_PPC_RADIX_MMU) && radix_enabled()) + radix_init_native(); + else if (IS_ENABLED(CONFIG_PPC_STD_MMU_64)) + hpte_init_native(); if (firmware_has_feature(FW_FEATURE_OPAL)) pnv_setup_machdep_opal(); diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c index 2f95d33cf34a..c9a3e677192a 100644 --- a/arch/powerpc/platforms/ps3/htab.c +++ b/arch/powerpc/platforms/ps3/htab.c @@ -63,7 +63,7 @@ static long ps3_hpte_insert(unsigned long hpte_group, unsigned long vpn, vflags &= ~HPTE_V_SECONDARY; hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize) | rflags; + hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize, ssize) | rflags; spin_lock_irqsave(&ps3_htab_lock, flags); diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c index a0bca05e26b0..492b2575e0d2 100644 --- a/arch/powerpc/platforms/ps3/spu.c +++ b/arch/powerpc/platforms/ps3/spu.c @@ -205,7 +205,7 @@ static void spu_unmap(struct spu *spu) static int __init setup_areas(struct spu *spu) { struct table {char* name; unsigned long addr; unsigned long size;}; - static const unsigned long shadow_flags = _PAGE_NO_CACHE | 3; + unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO)); spu_pdata(spu)->shadow = __ioremap(spu_pdata(spu)->shadow_addr, sizeof(struct spe_shadow), @@ -216,7 +216,7 @@ static int __init setup_areas(struct spu *spu) } spu->local_store = (__force void *)ioremap_prot(spu->local_store_phys, - LS_SIZE, _PAGE_NO_CACHE); + LS_SIZE, pgprot_val(pgprot_noncached_wc(__pgprot(0)))); if (!spu->local_store) { pr_debug("%s:%d: ioremap local_store failed\n", diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index e9ff44cd5d86..2ce138542083 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -116,6 +116,155 @@ static struct property *dlpar_clone_drconf_property(struct device_node *dn) return new_prop; } +static void dlpar_update_drconf_property(struct device_node *dn, + struct property *prop) +{ + struct of_drconf_cell *lmbs; + u32 num_lmbs, *p; + int i; + + /* Convert the property back to BE */ + p = prop->value; + num_lmbs = *p; + *p = cpu_to_be32(*p); + p++; + + lmbs = (struct of_drconf_cell *)p; + for (i = 0; i < num_lmbs; i++) { + lmbs[i].base_addr = cpu_to_be64(lmbs[i].base_addr); + lmbs[i].drc_index = cpu_to_be32(lmbs[i].drc_index); + lmbs[i].flags = cpu_to_be32(lmbs[i].flags); + } + + rtas_hp_event = true; + of_update_property(dn, prop); + rtas_hp_event = false; +} + +static int dlpar_update_device_tree_lmb(struct of_drconf_cell *lmb) +{ + struct device_node *dn; + struct property *prop; + struct of_drconf_cell *lmbs; + u32 *p, num_lmbs; + int i; + + dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (!dn) + return -ENODEV; + + prop = dlpar_clone_drconf_property(dn); + if (!prop) { + of_node_put(dn); + return -ENODEV; + } + + p = prop->value; + num_lmbs = *p++; + lmbs = (struct of_drconf_cell *)p; + + for (i = 0; i < num_lmbs; i++) { + if (lmbs[i].drc_index == lmb->drc_index) { + lmbs[i].flags = lmb->flags; + lmbs[i].aa_index = lmb->aa_index; + + dlpar_update_drconf_property(dn, prop); + break; + } + } + + of_node_put(dn); + return 0; +} + +static u32 lookup_lmb_associativity_index(struct of_drconf_cell *lmb) +{ + struct device_node *parent, *lmb_node, *dr_node; + const u32 *lmb_assoc; + const u32 *assoc_arrays; + u32 aa_index; + int aa_arrays, aa_array_entries, aa_array_sz; + int i; + + parent = of_find_node_by_path("/"); + if (!parent) + return -ENODEV; + + lmb_node = dlpar_configure_connector(cpu_to_be32(lmb->drc_index), + parent); + of_node_put(parent); + if (!lmb_node) + return -EINVAL; + + lmb_assoc = of_get_property(lmb_node, "ibm,associativity", NULL); + if (!lmb_assoc) { + dlpar_free_cc_nodes(lmb_node); + return -ENODEV; + } + + dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (!dr_node) { + dlpar_free_cc_nodes(lmb_node); + return -ENODEV; + } + + assoc_arrays = of_get_property(dr_node, + "ibm,associativity-lookup-arrays", + NULL); + of_node_put(dr_node); + if (!assoc_arrays) { + dlpar_free_cc_nodes(lmb_node); + return -ENODEV; + } + + /* The ibm,associativity-lookup-arrays property is defined to be + * a 32-bit value specifying the number of associativity arrays + * followed by a 32-bitvalue specifying the number of entries per + * array, followed by the associativity arrays. + */ + aa_arrays = be32_to_cpu(assoc_arrays[0]); + aa_array_entries = be32_to_cpu(assoc_arrays[1]); + aa_array_sz = aa_array_entries * sizeof(u32); + + aa_index = -1; + for (i = 0; i < aa_arrays; i++) { + int indx = (i * aa_array_entries) + 2; + + if (memcmp(&assoc_arrays[indx], &lmb_assoc[1], aa_array_sz)) + continue; + + aa_index = i; + break; + } + + dlpar_free_cc_nodes(lmb_node); + return aa_index; +} + +static int dlpar_add_device_tree_lmb(struct of_drconf_cell *lmb) +{ + int aa_index; + + lmb->flags |= DRCONF_MEM_ASSIGNED; + + aa_index = lookup_lmb_associativity_index(lmb); + if (aa_index < 0) { + pr_err("Couldn't find associativity index for drc index %x\n", + lmb->drc_index); + return aa_index; + } + + lmb->aa_index = aa_index; + return dlpar_update_device_tree_lmb(lmb); +} + +static int dlpar_remove_device_tree_lmb(struct of_drconf_cell *lmb) +{ + lmb->flags &= ~DRCONF_MEM_ASSIGNED; + lmb->aa_index = 0xffffffff; + return dlpar_update_device_tree_lmb(lmb); +} + static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb) { unsigned long section_nr; @@ -243,8 +392,8 @@ static int dlpar_remove_lmb(struct of_drconf_cell *lmb) memblock_remove(lmb->base_addr, block_sz); dlpar_release_drc(lmb->drc_index); + dlpar_remove_device_tree_lmb(lmb); - lmb->flags &= ~DRCONF_MEM_ASSIGNED; return 0; } @@ -384,43 +533,32 @@ static int dlpar_memory_remove_by_index(u32 drc_index, struct property *prop) #endif /* CONFIG_MEMORY_HOTREMOVE */ -static int dlpar_add_lmb(struct of_drconf_cell *lmb) +static int dlpar_add_lmb_memory(struct of_drconf_cell *lmb) { struct memory_block *mem_block; unsigned long block_sz; int nid, rc; - if (lmb->flags & DRCONF_MEM_ASSIGNED) - return -EINVAL; - block_sz = memory_block_size_bytes(); - rc = dlpar_acquire_drc(lmb->drc_index); - if (rc) - return rc; - /* Find the node id for this address */ nid = memory_add_physaddr_to_nid(lmb->base_addr); /* Add the memory */ rc = add_memory(nid, lmb->base_addr, block_sz); - if (rc) { - dlpar_release_drc(lmb->drc_index); + if (rc) return rc; - } /* Register this block of memory */ rc = memblock_add(lmb->base_addr, block_sz); if (rc) { remove_memory(nid, lmb->base_addr, block_sz); - dlpar_release_drc(lmb->drc_index); return rc; } mem_block = lmb_to_memblock(lmb); if (!mem_block) { remove_memory(nid, lmb->base_addr, block_sz); - dlpar_release_drc(lmb->drc_index); return -EINVAL; } @@ -428,7 +566,6 @@ static int dlpar_add_lmb(struct of_drconf_cell *lmb) put_device(&mem_block->dev); if (rc) { remove_memory(nid, lmb->base_addr, block_sz); - dlpar_release_drc(lmb->drc_index); return rc; } @@ -436,6 +573,34 @@ static int dlpar_add_lmb(struct of_drconf_cell *lmb) return 0; } +static int dlpar_add_lmb(struct of_drconf_cell *lmb) +{ + int rc; + + if (lmb->flags & DRCONF_MEM_ASSIGNED) + return -EINVAL; + + rc = dlpar_acquire_drc(lmb->drc_index); + if (rc) + return rc; + + rc = dlpar_add_device_tree_lmb(lmb); + if (rc) { + pr_err("Couldn't update device tree for drc index %x\n", + lmb->drc_index); + dlpar_release_drc(lmb->drc_index); + return rc; + } + + rc = dlpar_add_lmb_memory(lmb); + if (rc) { + dlpar_remove_device_tree_lmb(lmb); + dlpar_release_drc(lmb->drc_index); + } + + return rc; +} + static int dlpar_memory_add_by_count(u32 lmbs_to_add, struct property *prop) { struct of_drconf_cell *lmbs; @@ -536,31 +701,6 @@ static int dlpar_memory_add_by_index(u32 drc_index, struct property *prop) return rc; } -static void dlpar_update_drconf_property(struct device_node *dn, - struct property *prop) -{ - struct of_drconf_cell *lmbs; - u32 num_lmbs, *p; - int i; - - /* Convert the property back to BE */ - p = prop->value; - num_lmbs = *p; - *p = cpu_to_be32(*p); - p++; - - lmbs = (struct of_drconf_cell *)p; - for (i = 0; i < num_lmbs; i++) { - lmbs[i].base_addr = cpu_to_be64(lmbs[i].base_addr); - lmbs[i].drc_index = cpu_to_be32(lmbs[i].drc_index); - lmbs[i].flags = cpu_to_be32(lmbs[i].flags); - } - - rtas_hp_event = true; - of_update_property(dn, prop); - rtas_hp_event = false; -} - int dlpar_memory(struct pseries_hp_errorlog *hp_elog) { struct device_node *dn; @@ -608,10 +748,7 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) break; } - if (rc) - dlpar_free_drconf_property(prop); - else - dlpar_update_drconf_property(dn, prop); + dlpar_free_drconf_property(prop); dlpar_memory_out: of_node_put(dn); diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index bd98ce2be17b..b7dfc1359d01 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -912,7 +912,8 @@ machine_arch_initcall(pseries, find_existing_ddw_windows); static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, struct ddw_query_response *query) { - struct eeh_dev *edev; + struct device_node *dn; + struct pci_dn *pdn; u32 cfg_addr; u64 buid; int ret; @@ -923,11 +924,10 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, * Retrieve them from the pci device, not the node with the * dma-window property */ - edev = pci_dev_to_eeh_dev(dev); - cfg_addr = edev->config_addr; - if (edev->pe_config_addr) - cfg_addr = edev->pe_config_addr; - buid = edev->phb->buid; + dn = pci_device_to_OF_node(dev); + pdn = PCI_DN(dn); + buid = pdn->phb->buid; + cfg_addr = (pdn->busno << 8) | pdn->devfn; ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query, cfg_addr, BUID_HI(buid), BUID_LO(buid)); @@ -941,7 +941,8 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, struct ddw_create_response *create, int page_shift, int window_shift) { - struct eeh_dev *edev; + struct device_node *dn; + struct pci_dn *pdn; u32 cfg_addr; u64 buid; int ret; @@ -952,11 +953,10 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, * Retrieve them from the pci device, not the node with the * dma-window property */ - edev = pci_dev_to_eeh_dev(dev); - cfg_addr = edev->config_addr; - if (edev->pe_config_addr) - cfg_addr = edev->pe_config_addr; - buid = edev->phb->buid; + dn = pci_device_to_OF_node(dev); + pdn = PCI_DN(dn); + buid = pdn->phb->buid; + cfg_addr = (pdn->busno << 8) | pdn->devfn; do { /* extra outputs are LIOBN and dma-addr (hi, lo) */ diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 2415a0d31f8f..7f6100d91b4b 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -89,18 +89,21 @@ void vpa_init(int cpu) "%lx failed with %ld\n", cpu, hwcpu, addr, ret); return; } + +#ifdef CONFIG_PPC_STD_MMU_64 /* * PAPR says this feature is SLB-Buffer but firmware never * reports that. All SPLPAR support SLB shadow buffer. */ - addr = __pa(paca[cpu].slb_shadow_ptr); - if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) { + addr = __pa(paca[cpu].slb_shadow_ptr); ret = register_slb_shadow(hwcpu, addr); if (ret) pr_err("WARNING: SLB shadow buffer registration for " "cpu %d (hw %d) of area %lx failed with %ld\n", cpu, hwcpu, addr, ret); } +#endif /* CONFIG_PPC_STD_MMU_64 */ /* * Register dispatch trace log, if one has been allocated. @@ -123,6 +126,8 @@ void vpa_init(int cpu) } } +#ifdef CONFIG_PPC_STD_MMU_64 + static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long vpn, unsigned long pa, unsigned long rflags, unsigned long vflags, @@ -139,7 +144,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, hpte_group, vpn, pa, rflags, vflags, psize); hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; + hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags; if (!(vflags & HPTE_V_BOLTED)) pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r); @@ -152,10 +157,6 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, /* Exact = 0 */ flags = 0; - /* Make pHyp happy */ - if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU)) - hpte_r &= ~HPTE_R_M; - if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N)) flags |= H_COALESCE_CAND; @@ -659,6 +660,8 @@ static void pSeries_set_page_state(struct page *page, int order, void arch_free_page(struct page *page, int order) { + if (radix_enabled()) + return; if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO)) return; @@ -666,7 +669,8 @@ void arch_free_page(struct page *page, int order) } EXPORT_SYMBOL(arch_free_page); -#endif +#endif /* CONFIG_PPC_SMLPAR */ +#endif /* CONFIG_PPC_STD_MMU_64 */ #ifdef CONFIG_TRACEPOINTS #ifdef HAVE_JUMP_LABEL diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index c9fecf09b8fa..afa05a2cb702 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -484,8 +484,9 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) seq_printf(m, "shared_processor_mode=%d\n", lppaca_shared_proc(get_lppaca())); +#ifdef CONFIG_PPC_STD_MMU_64 seq_printf(m, "slb_size=%d\n", mmu_slb_size); - +#endif parse_em_data(m); return 0; diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index ceb18d349459..a560a98bcf3b 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -191,8 +191,8 @@ static int update_dt_node(__be32 phandle, s32 scope) break; case 0x80000000: - prop = of_find_property(dn, prop_name, NULL); - of_remove_property(dn, prop); + of_remove_property(dn, of_find_property(dn, + prop_name, NULL)); prop = NULL; break; diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 272e9ec1ab54..543a6386f3eb 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -305,7 +305,7 @@ static int msi_quota_for_device(struct pci_dev *dev, int request) memset(&counts, 0, sizeof(struct msi_counts)); /* Work out how many devices we have below this PE */ - traverse_pci_devices(pe_dn, count_non_bridge_devices, &counts); + pci_traverse_device_nodes(pe_dn, count_non_bridge_devices, &counts); if (counts.num_devices == 0) { pr_err("rtas_msi: found 0 devices under PE for %s\n", @@ -320,7 +320,7 @@ static int msi_quota_for_device(struct pci_dev *dev, int request) /* else, we have some more calculating to do */ counts.requestor = pci_device_to_OF_node(dev); counts.request = request; - traverse_pci_devices(pe_dn, count_spare_msis, &counts); + pci_traverse_device_nodes(pe_dn, count_spare_msis, &counts); /* If the quota isn't an integer multiple of the total, we can * use the remainder as spare MSIs for anyone that wants them. */ diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 5d4a3df59d0c..906dbaa97fe2 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -34,38 +34,6 @@ #include "pseries.h" -static struct pci_bus * -find_bus_among_children(struct pci_bus *bus, - struct device_node *dn) -{ - struct pci_bus *child = NULL; - struct pci_bus *tmp; - struct device_node *busdn; - - busdn = pci_bus_to_OF_node(bus); - if (busdn == dn) - return bus; - - list_for_each_entry(tmp, &bus->children, node) { - child = find_bus_among_children(tmp, dn); - if (child) - break; - }; - return child; -} - -struct pci_bus * -pcibios_find_pci_bus(struct device_node *dn) -{ - struct pci_dn *pdn = dn->data; - - if (!pdn || !pdn->phb || !pdn->phb->bus) - return NULL; - - return find_bus_among_children(pdn->phb->bus, dn); -} -EXPORT_SYMBOL_GPL(pcibios_find_pci_bus); - struct pci_controller *init_phb_dynamic(struct device_node *dn) { struct pci_controller *phb; diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index 7c7fcc042549..cc66c49f07aa 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -303,7 +303,6 @@ static int do_remove_property(char *buf, size_t bufsize) { struct device_node *np; char *tmp; - struct property *prop; buf = parse_node(buf, bufsize, &np); if (!np) @@ -316,9 +315,7 @@ static int do_remove_property(char *buf, size_t bufsize) if (strlen(buf) == 0) return -EINVAL; - prop = of_find_property(np, buf, NULL); - - return of_remove_property(np, prop); + return of_remove_property(np, of_find_property(np, buf, NULL)); } static int do_update_property(char *buf, size_t bufsize) diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 6e944fc6e5f9..9883bc7ea007 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -235,6 +235,8 @@ static void __init pseries_discover_pic(void) for_each_node_by_name(np, "interrupt-controller") { typep = of_get_property(np, "compatible", NULL); + if (!typep) + continue; if (strstr(typep, "open-pic")) { pSeries_mpic_node = of_node_get(np); ppc_md.init_IRQ = pseries_mpic_init_IRQ; @@ -265,7 +267,7 @@ static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long act pdn = parent ? PCI_DN(parent) : NULL; if (pdn) { /* Create pdn and EEH device */ - update_dn_pci_info(np, pdn->phb); + pci_add_device_node_info(pdn->phb, np); eeh_dev_init(PCI_DN(np), pdn->phb); } diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 85729f49764f..0ef9df49f0f2 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -527,6 +528,8 @@ int fsl_add_bridge(struct platform_device *pdev, int is_primary) u8 hdr_type, progif; struct device_node *dev; struct ccsr_pci __iomem *pci; + u16 temp; + u32 svr = mfspr(SPRN_SVR); dev = pdev->dev.of_node; @@ -596,6 +599,27 @@ int fsl_add_bridge(struct platform_device *pdev, int is_primary) PPC_INDIRECT_TYPE_SURPRESS_PRIMARY_BUS; if (fsl_pcie_check_link(hose)) hose->indirect_type |= PPC_INDIRECT_TYPE_NO_PCIE_LINK; + } else { + /* + * Set PBFR(PCI Bus Function Register)[10] = 1 to + * disable the combining of crossing cacheline + * boundary requests into one burst transaction. + * PCI-X operation is not affected. + * Fix erratum PCI 5 on MPC8548 + */ +#define PCI_BUS_FUNCTION 0x44 +#define PCI_BUS_FUNCTION_MDS 0x400 /* Master disable streaming */ + if (((SVR_SOC_VER(svr) == SVR_8543) || + (SVR_SOC_VER(svr) == SVR_8545) || + (SVR_SOC_VER(svr) == SVR_8547) || + (SVR_SOC_VER(svr) == SVR_8548)) && + !early_find_capability(hose, 0, 0, PCI_CAP_ID_PCIX)) { + early_read_config_word(hose, 0, 0, + PCI_BUS_FUNCTION, &temp); + temp |= PCI_BUS_FUNCTION_MDS; + early_write_config_word(hose, 0, 0, + PCI_BUS_FUNCTION, temp); + } } printk(KERN_INFO "Found FSL PCI host bridge at 0x%016llx. " diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index afe3c7cd395d..7de45b2df366 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -2004,8 +2004,15 @@ static struct syscore_ops mpic_syscore_ops = { static int mpic_init_sys(void) { + int rc; + register_syscore_ops(&mpic_syscore_ops); - subsys_system_register(&mpic_subsys, NULL); + rc = subsys_system_register(&mpic_subsys, NULL); + if (rc) { + unregister_syscore_ops(&mpic_syscore_ops); + pr_err("mpic: Failed to register subsystem!\n"); + return rc; + } return 0; } diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 436062dbb6e2..0b2f771593eb 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -7,7 +7,7 @@ UBSAN_SANITIZE := n ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -obj-y += xmon.o nonstdio.o +obj-y += xmon.o nonstdio.o spr_access.o ifdef CONFIG_XMON_DISASSEMBLY obj-y += ppc-dis.o ppc-opc.o diff --git a/arch/powerpc/xmon/spr_access.S b/arch/powerpc/xmon/spr_access.S new file mode 100644 index 000000000000..84ad74213c83 --- /dev/null +++ b/arch/powerpc/xmon/spr_access.S @@ -0,0 +1,45 @@ +#include + +/* unsigned long xmon_mfspr(sprn, default_value) */ +_GLOBAL(xmon_mfspr) + ld r5, .Lmfspr_table@got(r2) + b xmon_mxspr + +/* void xmon_mtspr(sprn, new_value) */ +_GLOBAL(xmon_mtspr) + ld r5, .Lmtspr_table@got(r2) + b xmon_mxspr + +/* + * r3 = sprn + * r4 = default or new value + * r5 = table base + */ +xmon_mxspr: + /* + * To index into the table of mxsprs we need: + * i = (sprn & 0x3ff) * 8 + * or using rwlinm: + * i = (sprn << 3) & (0x3ff << 3) + */ + rlwinm r3, r3, 3, 0x3ff << 3 + add r5, r5, r3 + mtctr r5 + mr r3, r4 /* put default_value in r3 for mfspr */ + bctr + +.Lmfspr_table: + spr = 0 + .rept 1024 + mfspr r3, spr + blr + spr = spr + 1 + .endr + +.Lmtspr_table: + spr = 0 + .rept 1024 + mtspr spr, r4 + blr + spr = spr + 1 + .endr diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 942796fa4767..c5e155108be5 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -86,6 +86,7 @@ static char tmpstr[128]; static long bus_error_jmp[JMP_BUF_LEN]; static int catch_memory_errors; +static int catch_spr_faults; static long *xmon_fault_jmp[NR_CPUS]; /* Breakpoint stuff */ @@ -147,7 +148,7 @@ void getstring(char *, int); static void flush_input(void); static int inchar(void); static void take_input(char *); -static unsigned long read_spr(int); +static int read_spr(int, unsigned long *); static void write_spr(int, unsigned long); static void super_regs(void); static void remove_bpts(void); @@ -250,6 +251,9 @@ Commands:\n\ sdi # disassemble spu local store for spu # (in hex)\n" #endif " S print special registers\n\ + Sa print all SPRs\n\ + Sr # read SPR #\n\ + Sw #v write v to SPR #\n\ t print backtrace\n\ x exit monitor and recover\n\ X exit monitor and don't recover\n" @@ -442,6 +446,12 @@ static int xmon_core(struct pt_regs *regs, int fromipi) #ifdef CONFIG_SMP cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, &cpus_in_xmon)) { + /* + * We catch SPR read/write faults here because the 0x700, 0xf60 + * etc. handlers don't call debugger_fault_handler(). + */ + if (catch_spr_faults) + longjmp(bus_error_jmp, 1); get_output_lock(); excprint(regs); printf("cpu 0x%x: Exception %lx %s in xmon, " @@ -1635,89 +1645,87 @@ static void cacheflush(void) catch_memory_errors = 0; } -static unsigned long -read_spr(int n) +extern unsigned long xmon_mfspr(int spr, unsigned long default_value); +extern void xmon_mtspr(int spr, unsigned long value); + +static int +read_spr(int n, unsigned long *vp) { - unsigned int instrs[2]; - unsigned long (*code)(void); unsigned long ret = -1UL; -#ifdef CONFIG_PPC64 - unsigned long opd[3]; - - opd[0] = (unsigned long)instrs; - opd[1] = 0; - opd[2] = 0; - code = (unsigned long (*)(void)) opd; -#else - code = (unsigned long (*)(void)) instrs; -#endif - - /* mfspr r3,n; blr */ - instrs[0] = 0x7c6002a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6); - instrs[1] = 0x4e800020; - store_inst(instrs); - store_inst(instrs+1); + int ok = 0; if (setjmp(bus_error_jmp) == 0) { - catch_memory_errors = 1; + catch_spr_faults = 1; sync(); - ret = code(); + ret = xmon_mfspr(n, *vp); sync(); - /* wait a little while to see if we get a machine check */ - __delay(200); - n = size; + *vp = ret; + ok = 1; } + catch_spr_faults = 0; - return ret; + return ok; } static void write_spr(int n, unsigned long val) { - unsigned int instrs[2]; - unsigned long (*code)(unsigned long); -#ifdef CONFIG_PPC64 - unsigned long opd[3]; - - opd[0] = (unsigned long)instrs; - opd[1] = 0; - opd[2] = 0; - code = (unsigned long (*)(unsigned long)) opd; -#else - code = (unsigned long (*)(unsigned long)) instrs; -#endif - - instrs[0] = 0x7c6003a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6); - instrs[1] = 0x4e800020; - store_inst(instrs); - store_inst(instrs+1); - if (setjmp(bus_error_jmp) == 0) { - catch_memory_errors = 1; + catch_spr_faults = 1; sync(); - code(val); + xmon_mtspr(n, val); sync(); - /* wait a little while to see if we get a machine check */ - __delay(200); - n = size; + } else { + printf("SPR 0x%03x (%4d) Faulted during write\n", n, n); } + catch_spr_faults = 0; } static unsigned long regno; extern char exc_prolog; extern char dec_exc; +static void dump_one_spr(int spr, bool show_unimplemented) +{ + unsigned long val; + + val = 0xdeadbeef; + if (!read_spr(spr, &val)) { + printf("SPR 0x%03x (%4d) Faulted during read\n", spr, spr); + return; + } + + if (val == 0xdeadbeef) { + /* Looks like read was a nop, confirm */ + val = 0x0badcafe; + if (!read_spr(spr, &val)) { + printf("SPR 0x%03x (%4d) Faulted during read\n", spr, spr); + return; + } + + if (val == 0x0badcafe) { + if (show_unimplemented) + printf("SPR 0x%03x (%4d) Unimplemented\n", spr, spr); + return; + } + } + + printf("SPR 0x%03x (%4d) = 0x%lx\n", spr, spr, val); +} + static void super_regs(void) { int cmd; - unsigned long val; + int spr; cmd = skipbl(); - if (cmd == '\n') { + + switch (cmd) { + case '\n': { unsigned long sp, toc; asm("mr %0,1" : "=r" (sp) :); asm("mr %0,2" : "=r" (toc) :); @@ -1730,21 +1738,29 @@ static void super_regs(void) mfspr(SPRN_DEC), mfspr(SPRN_SPRG2)); printf("sp = "REG" sprg3= "REG"\n", sp, mfspr(SPRN_SPRG3)); printf("toc = "REG" dar = "REG"\n", toc, mfspr(SPRN_DAR)); - return; } - - scanhex(®no); - switch (cmd) { - case 'w': - val = read_spr(regno); + case 'w': { + unsigned long val; + scanhex(®no); + val = 0; + read_spr(regno, &val); scanhex(&val); write_spr(regno, val); - /* fall through */ - case 'r': - printf("spr %lx = %lx\n", regno, read_spr(regno)); + dump_one_spr(regno, true); break; } + case 'r': + scanhex(®no); + dump_one_spr(regno, true); + break; + case 'a': + /* dump ALL SPRs */ + for (spr = 1; spr < 1024; ++spr) + dump_one_spr(spr, false); + break; + } + scannl(); } @@ -2913,7 +2929,7 @@ static void xmon_print_symbol(unsigned long address, const char *mid, printf("%s", after); } -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_STD_MMU_64 void dump_segments(void) { int i; diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c index b7b576e53e92..ff44016ea031 100644 --- a/drivers/cpufreq/pmac32-cpufreq.c +++ b/drivers/cpufreq/pmac32-cpufreq.c @@ -300,7 +300,7 @@ static int pmu_set_cpu_speed(int low_speed) _set_L3CR(save_l3cr); /* Restore userland MMU context */ - switch_mmu_context(NULL, current->active_mm); + switch_mmu_context(NULL, current->active_mm, NULL); #ifdef DEBUG_FREQ printk(KERN_DEBUG "HID1, after: %x\n", mfspr(SPRN_HID1)); diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 24f4a782e0f4..ff946d5f59e4 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -824,10 +824,7 @@ static int mmap_piobufs(struct vm_area_struct *vma, phys = dd->physaddr + piobufs; #if defined(__powerpc__) - /* There isn't a generic way to specify writethrough mappings */ - pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE; - pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU; - pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); #endif /* diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c index 4758a3801ae8..6abe1c621aa4 100644 --- a/drivers/infiniband/hw/qib/qib_pcie.c +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -144,13 +144,7 @@ int qib_pcie_ddinit(struct qib_devdata *dd, struct pci_dev *pdev, addr = pci_resource_start(pdev, 0); len = pci_resource_len(pdev, 0); -#if defined(__powerpc__) - /* There isn't a generic way to specify writethrough mappings */ - dd->kregbase = __ioremap(addr, len, _PAGE_NO_CACHE | _PAGE_WRITETHRU); -#else dd->kregbase = ioremap_nocache(addr, len); -#endif - if (!dd->kregbase) return -ENOMEM; diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c index caaec654d7ea..465c52219639 100644 --- a/drivers/macintosh/rack-meter.c +++ b/drivers/macintosh/rack-meter.c @@ -154,8 +154,8 @@ static void rackmeter_do_pause(struct rackmeter *rm, int pause) DBDMA_DO_STOP(rm->dma_regs); return; } - memset(rdma->buf1, 0, SAMPLE_COUNT & sizeof(u32)); - memset(rdma->buf2, 0, SAMPLE_COUNT & sizeof(u32)); + memset(rdma->buf1, 0, ARRAY_SIZE(rdma->buf1)); + memset(rdma->buf2, 0, ARRAY_SIZE(rdma->buf2)); rm->dma_buf_v->mark = 0; @@ -227,6 +227,7 @@ static void rackmeter_do_timer(struct work_struct *work) total_idle_ticks = get_cpu_idle_time(cpu); idle_ticks = (unsigned int) (total_idle_ticks - rcpu->prev_idle); + idle_ticks = min(idle_ticks, total_ticks); rcpu->prev_idle = total_idle_ticks; /* We do a very dumb calculation to update the LEDs for now, diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index 01ee736fe0ef..f8b6d1403c16 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -1851,7 +1851,7 @@ static int powerbook_sleep_grackle(void) _set_L2CR(save_l2cr); /* Restore userland MMU context */ - switch_mmu_context(NULL, current->active_mm); + switch_mmu_context(NULL, current->active_mm, NULL); /* Power things up */ pmu_unlock(); @@ -1940,7 +1940,7 @@ powerbook_sleep_Core99(void) _set_L3CR(save_l3cr); /* Restore userland MMU context */ - switch_mmu_context(NULL, current->active_mm); + switch_mmu_context(NULL, current->active_mm, NULL); /* Tell PMU we are ready */ pmu_unlock(); diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c index 2107c948406d..6d228ccd884d 100644 --- a/drivers/misc/cxl/api.c +++ b/drivers/misc/cxl/api.c @@ -68,15 +68,6 @@ struct cxl_context *cxl_get_context(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(cxl_get_context); -struct device *cxl_get_phys_dev(struct pci_dev *dev) -{ - struct cxl_afu *afu; - - afu = cxl_pci_to_afu(dev); - - return afu->adapter->dev.parent; -} - int cxl_release_context(struct cxl_context *ctx) { if (ctx->status >= STARTED) @@ -192,6 +183,7 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed, ctx->pid = get_task_pid(task, PIDTYPE_PID); ctx->glpid = get_task_pid(task->group_leader, PIDTYPE_PID); kernel = false; + ctx->real_mode = false; } cxl_ctx_get(); @@ -228,6 +220,24 @@ void cxl_set_master(struct cxl_context *ctx) } EXPORT_SYMBOL_GPL(cxl_set_master); +int cxl_set_translation_mode(struct cxl_context *ctx, bool real_mode) +{ + if (ctx->status == STARTED) { + /* + * We could potentially update the PE and issue an update LLCMD + * to support this, but it doesn't seem to have a good use case + * since it's trivial to just create a second kernel context + * with different translation modes, so until someone convinces + * me otherwise: + */ + return -EBUSY; + } + + ctx->real_mode = real_mode; + return 0; +} +EXPORT_SYMBOL_GPL(cxl_set_translation_mode); + /* wrappers around afu_* file ops which are EXPORTED */ int cxl_fd_open(struct inode *inode, struct file *file) { diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 7edea9c19199..26d206b1d08c 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -297,8 +297,7 @@ static void reclaim_ctx(struct rcu_head *rcu) if (ctx->kernelapi) kfree(ctx->mapping); - if (ctx->irq_bitmap) - kfree(ctx->irq_bitmap); + kfree(ctx->irq_bitmap); /* Drop ref to the afu device taken during cxl_context_init */ cxl_afu_put(ctx->afu); diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index 73dc2a33da74..4fe50788ff45 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -178,15 +178,6 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0}; #define CXL_PSL_SR_An_MP (1ull << (63-62)) /* Master Process */ #define CXL_PSL_SR_An_LE (1ull << (63-63)) /* Little Endian */ -/****** CXL_PSL_LLCMD_An ****************************************************/ -#define CXL_LLCMD_TERMINATE 0x0001000000000000ULL -#define CXL_LLCMD_REMOVE 0x0002000000000000ULL -#define CXL_LLCMD_SUSPEND 0x0003000000000000ULL -#define CXL_LLCMD_RESUME 0x0004000000000000ULL -#define CXL_LLCMD_ADD 0x0005000000000000ULL -#define CXL_LLCMD_UPDATE 0x0006000000000000ULL -#define CXL_LLCMD_HANDLE_MASK 0x000000000000ffffULL - /****** CXL_PSL_ID_An ****************************************************/ #define CXL_PSL_ID_An_F (1ull << (63-31)) #define CXL_PSL_ID_An_L (1ull << (63-30)) @@ -376,11 +367,13 @@ struct cxl_afu_native { }; struct cxl_afu_guest { + struct cxl_afu *parent; u64 handle; phys_addr_t p2n_phys; u64 p2n_size; int max_ints; - struct mutex recovery_lock; + bool handle_err; + struct delayed_work work_err; int previous_state; }; @@ -524,6 +517,7 @@ struct cxl_context { bool pe_inserted; bool master; bool kernel; + bool real_mode; bool pending_irq; bool pending_fault; bool pending_afu_err; @@ -580,6 +574,7 @@ struct cxl { bool perst_loads_image; bool perst_select_user; bool perst_same_image; + bool psl_timebase_synced; }; int cxl_pci_alloc_one_irq(struct cxl *adapter); diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index 9a8650bcb042..377e650a2a1d 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -149,11 +149,13 @@ static void cxl_handle_page_fault(struct cxl_context *ctx, * update_mmu_cache() will not have loaded the hash since current->trap * is not a 0x400 or 0x300, so just call hash_page_mm() here. */ - access = _PAGE_PRESENT; + access = _PAGE_PRESENT | _PAGE_READ; if (dsisr & CXL_PSL_DSISR_An_S) - access |= _PAGE_RW; - if ((!ctx->kernel) || ~(dar & (1ULL << 63))) - access |= _PAGE_USER; + access |= _PAGE_WRITE; + + access |= _PAGE_PRIVILEGED; + if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID)) + access &= ~_PAGE_PRIVILEGED; if (dsisr & DSISR_NOHPTE) inv_flags |= HPTE_NOHPTE_UPDATE; diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c index 8213372de2b7..bc8d0b9870eb 100644 --- a/drivers/misc/cxl/guest.c +++ b/drivers/misc/cxl/guest.c @@ -178,6 +178,9 @@ static int afu_read_error_state(struct cxl_afu *afu, int *state_out) u64 state; int rc = 0; + if (!afu) + return -EIO; + rc = cxl_h_read_error_state(afu->guest->handle, &state); if (!rc) { WARN_ON(state != H_STATE_NORMAL && @@ -552,6 +555,17 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr) elem->common.sstp0 = cpu_to_be64(ctx->sstp0); elem->common.sstp1 = cpu_to_be64(ctx->sstp1); + + /* + * Ensure we have at least one interrupt allocated to take faults for + * kernel contexts that may not have allocated any AFU IRQs at all: + */ + if (ctx->irqs.range[0] == 0) { + rc = afu_register_irqs(ctx, 0); + if (rc) + goto out_free; + } + for (r = 0; r < CXL_IRQ_RANGES; r++) { for (i = 0; i < ctx->irqs.range[r]; i++) { if (r == 0 && i == 0) { @@ -597,6 +611,7 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr) enable_afu_irqs(ctx); } +out_free: free_page((u64)elem); return rc; } @@ -605,6 +620,9 @@ static int guest_attach_process(struct cxl_context *ctx, bool kernel, u64 wed, u { pr_devel("in %s\n", __func__); + if (ctx->real_mode) + return -EPERM; + ctx->kernel = kernel; if (ctx->afu->current_mode == CXL_MODE_DIRECTED) return attach_afu_directed(ctx, wed, amr); @@ -818,7 +836,6 @@ static int afu_update_state(struct cxl_afu *afu) switch (cur_state) { case H_STATE_NORMAL: afu->guest->previous_state = cur_state; - rc = 1; break; case H_STATE_DISABLE: @@ -834,7 +851,6 @@ static int afu_update_state(struct cxl_afu *afu) pci_error_handlers(afu, CXL_SLOT_RESET_EVENT, pci_channel_io_normal); pci_error_handlers(afu, CXL_RESUME_EVENT, 0); - rc = 1; } afu->guest->previous_state = 0; break; @@ -859,39 +875,30 @@ static int afu_update_state(struct cxl_afu *afu) return rc; } -static int afu_do_recovery(struct cxl_afu *afu) +static void afu_handle_errstate(struct work_struct *work) { - int rc; + struct cxl_afu_guest *afu_guest = + container_of(to_delayed_work(work), struct cxl_afu_guest, work_err); - /* many threads can arrive here, in case of detach_all for example. - * Only one needs to drive the recovery - */ - if (mutex_trylock(&afu->guest->recovery_lock)) { - rc = afu_update_state(afu); - mutex_unlock(&afu->guest->recovery_lock); - return rc; - } - return 0; + if (!afu_update_state(afu_guest->parent) && + afu_guest->previous_state == H_STATE_PERM_UNAVAILABLE) + return; + + if (afu_guest->handle_err == true) + schedule_delayed_work(&afu_guest->work_err, + msecs_to_jiffies(3000)); } static bool guest_link_ok(struct cxl *cxl, struct cxl_afu *afu) { int state; - if (afu) { - if (afu_read_error_state(afu, &state) || - state != H_STATE_NORMAL) { - if (afu_do_recovery(afu) > 0) { - /* check again in case we've just fixed it */ - if (!afu_read_error_state(afu, &state) && - state == H_STATE_NORMAL) - return true; - } - return false; - } + if (afu && (!afu_read_error_state(afu, &state))) { + if (state == H_STATE_NORMAL) + return true; } - return true; + return false; } static int afu_properties_look_ok(struct cxl_afu *afu) @@ -929,8 +936,6 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n return -ENOMEM; } - mutex_init(&afu->guest->recovery_lock); - if ((rc = dev_set_name(&afu->dev, "afu%i.%i", adapter->adapter_num, slice))) @@ -986,6 +991,15 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n afu->enabled = true; + /* + * wake up the cpu periodically to check the state + * of the AFU using "afu" stored in the guest structure. + */ + afu->guest->parent = afu; + afu->guest->handle_err = true; + INIT_DELAYED_WORK(&afu->guest->work_err, afu_handle_errstate); + schedule_delayed_work(&afu->guest->work_err, msecs_to_jiffies(1000)); + if ((rc = cxl_pci_vphb_add(afu))) dev_info(&afu->dev, "Can't register vPHB\n"); @@ -1014,6 +1028,10 @@ void cxl_guest_remove_afu(struct cxl_afu *afu) if (!afu) return; + /* flush and stop pending job */ + afu->guest->handle_err = false; + flush_delayed_work(&afu->guest->work_err); + cxl_pci_vphb_remove(afu); cxl_sysfs_afu_remove(afu); @@ -1101,6 +1119,12 @@ struct cxl *cxl_guest_init_adapter(struct device_node *np, struct platform_devic adapter->dev.release = release_adapter; dev_set_drvdata(&pdev->dev, adapter); + /* + * Hypervisor controls PSL timebase initialization (p1 register). + * On FW840, PSL is initialized. + */ + adapter->psl_timebase_synced = true; + if ((rc = cxl_of_read_adapter_handle(adapter, np))) goto err1; diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index ecf7557cd657..55d8a1459f28 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -186,16 +186,25 @@ static int spa_max_procs(int spa_size) int cxl_alloc_spa(struct cxl_afu *afu) { + unsigned spa_size; + /* Work out how many pages to allocate */ afu->native->spa_order = 0; do { afu->native->spa_order++; - afu->native->spa_size = (1 << afu->native->spa_order) * PAGE_SIZE; + spa_size = (1 << afu->native->spa_order) * PAGE_SIZE; + + if (spa_size > 0x100000) { + dev_warn(&afu->dev, "num_of_processes too large for the SPA, limiting to %i (0x%x)\n", + afu->native->spa_max_procs, afu->native->spa_size); + afu->num_procs = afu->native->spa_max_procs; + break; + } + + afu->native->spa_size = spa_size; afu->native->spa_max_procs = spa_max_procs(afu->native->spa_size); } while (afu->native->spa_max_procs < afu->num_procs); - WARN_ON(afu->native->spa_size > 0x100000); /* Max size supported by the hardware */ - if (!(afu->native->spa = (struct cxl_process_element *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, afu->native->spa_order))) { pr_err("cxl_alloc_spa: Unable to allocate scheduled process area\n"); @@ -486,8 +495,9 @@ static u64 calculate_sr(struct cxl_context *ctx) if (mfspr(SPRN_LPCR) & LPCR_TC) sr |= CXL_PSL_SR_An_TC; if (ctx->kernel) { - sr |= CXL_PSL_SR_An_R | (mfmsr() & MSR_SF); - sr |= CXL_PSL_SR_An_HV; + if (!ctx->real_mode) + sr |= CXL_PSL_SR_An_R; + sr |= (mfmsr() & MSR_SF) | CXL_PSL_SR_An_HV; } else { sr |= CXL_PSL_SR_An_PR | CXL_PSL_SR_An_R; sr &= ~(CXL_PSL_SR_An_HV); @@ -526,6 +536,15 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr) ctx->elem->common.sstp0 = cpu_to_be64(ctx->sstp0); ctx->elem->common.sstp1 = cpu_to_be64(ctx->sstp1); + /* + * Ensure we have the multiplexed PSL interrupt set up to take faults + * for kernel contexts that may not have allocated any AFU IRQs at all: + */ + if (ctx->irqs.range[0] == 0) { + ctx->irqs.offset[0] = ctx->afu->native->psl_hwirq; + ctx->irqs.range[0] = 1; + } + for (r = 0; r < CXL_IRQ_RANGES; r++) { ctx->elem->ivte_offsets[r] = cpu_to_be16(ctx->irqs.offset[r]); ctx->elem->ivte_ranges[r] = cpu_to_be16(ctx->irqs.range[r]); diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 2844e975bf79..a08fcc888a71 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "cxl.h" #include @@ -321,12 +322,43 @@ static void dump_afu_descriptor(struct cxl_afu *afu) #undef show_reg } +#define CAPP_UNIT0_ID 0xBA +#define CAPP_UNIT1_ID 0XBE + +static u64 get_capp_unit_id(struct device_node *np) +{ + u32 phb_index; + + /* + * For chips other than POWER8NVL, we only have CAPP 0, + * irrespective of which PHB is used. + */ + if (!pvr_version_is(PVR_POWER8NVL)) + return CAPP_UNIT0_ID; + + /* + * For POWER8NVL, assume CAPP 0 is attached to PHB0 and + * CAPP 1 is attached to PHB1. + */ + if (of_property_read_u32(np, "ibm,phb-index", &phb_index)) + return 0; + + if (phb_index == 0) + return CAPP_UNIT0_ID; + + if (phb_index == 1) + return CAPP_UNIT1_ID; + + return 0; +} + static int init_implementation_adapter_regs(struct cxl *adapter, struct pci_dev *dev) { struct device_node *np; const __be32 *prop; u64 psl_dsnctl; u64 chipid; + u64 capp_unit_id; if (!(np = pnv_pci_get_phb_node(dev))) return -ENODEV; @@ -336,10 +368,19 @@ static int init_implementation_adapter_regs(struct cxl *adapter, struct pci_dev if (!np) return -ENODEV; chipid = be32_to_cpup(prop); + capp_unit_id = get_capp_unit_id(np); of_node_put(np); + if (!capp_unit_id) { + pr_err("cxl: invalid capp unit id\n"); + return -ENODEV; + } + psl_dsnctl = 0x0000900000000000ULL; /* pteupd ttype, scdone */ + psl_dsnctl |= (0x2ULL << (63-38)); /* MMIO hang pulse: 256 us */ /* Tell PSL where to route data to */ - psl_dsnctl = 0x02E8900002000000ULL | (chipid << (63-5)); + psl_dsnctl |= (chipid << (63-5)); + psl_dsnctl |= (capp_unit_id << (63-13)); + cxl_p1_write(adapter, CXL_PSL_DSNDCTL, psl_dsnctl); cxl_p1_write(adapter, CXL_PSL_RESLCKTO, 0x20000000200ULL); /* snoop write mask */ @@ -355,22 +396,24 @@ static int init_implementation_adapter_regs(struct cxl *adapter, struct pci_dev #define TBSYNC_CNT(n) (((u64)n & 0x7) << (63-6)) #define _2048_250MHZ_CYCLES 1 -static int cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev) +static void cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev) { u64 psl_tb; int delta; unsigned int retry = 0; struct device_node *np; + adapter->psl_timebase_synced = false; + if (!(np = pnv_pci_get_phb_node(dev))) - return -ENODEV; + return; /* Do not fail when CAPP timebase sync is not supported by OPAL */ of_node_get(np); if (! of_get_property(np, "ibm,capp-timebase-sync", NULL)) { of_node_put(np); - pr_err("PSL: Timebase sync: OPAL support missing\n"); - return 0; + dev_info(&dev->dev, "PSL timebase inactive: OPAL support missing\n"); + return; } of_node_put(np); @@ -389,8 +432,8 @@ static int cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev) do { msleep(1); if (retry++ > 5) { - pr_err("PSL: Timebase sync: giving up!\n"); - return -EIO; + dev_info(&dev->dev, "PSL timebase can't synchronize\n"); + return; } psl_tb = cxl_p1_read(adapter, CXL_PSL_Timebase); delta = mftb() - psl_tb; @@ -398,7 +441,8 @@ static int cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev) delta = -delta; } while (tb_to_ns(delta) > 16000); - return 0; + adapter->psl_timebase_synced = true; + return; } static int init_implementation_afu_regs(struct cxl_afu *afu) @@ -1144,8 +1188,8 @@ static int cxl_configure_adapter(struct cxl *adapter, struct pci_dev *dev) if ((rc = pnv_phb_to_cxl_mode(dev, OPAL_PHB_CAPI_MODE_SNOOP_ON))) goto err; - if ((rc = cxl_setup_psl_timebase(adapter, dev))) - goto err; + /* Ignore error, adapter init is not dependant on timebase sync */ + cxl_setup_psl_timebase(adapter, dev); if ((rc = cxl_native_register_psl_err_irq(adapter))) goto err; diff --git a/drivers/misc/cxl/sysfs.c b/drivers/misc/cxl/sysfs.c index 25913c08794c..b043c20f158f 100644 --- a/drivers/misc/cxl/sysfs.c +++ b/drivers/misc/cxl/sysfs.c @@ -57,6 +57,15 @@ static ssize_t image_loaded_show(struct device *device, return scnprintf(buf, PAGE_SIZE, "factory\n"); } +static ssize_t psl_timebase_synced_show(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct cxl *adapter = to_cxl_adapter(device); + + return scnprintf(buf, PAGE_SIZE, "%i\n", adapter->psl_timebase_synced); +} + static ssize_t reset_adapter_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) @@ -142,6 +151,7 @@ static struct device_attribute adapter_attrs[] = { __ATTR_RO(psl_revision), __ATTR_RO(base_image), __ATTR_RO(image_loaded), + __ATTR_RO(psl_timebase_synced), __ATTR_RW(load_image_on_perst), __ATTR_RW(perst_reloads_same_image), __ATTR(reset, S_IWUSR, NULL, reset_adapter_store), diff --git a/drivers/of/base.c b/drivers/of/base.c index b299de2b3afa..64018ebcc861 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1777,6 +1777,9 @@ int of_remove_property(struct device_node *np, struct property *prop) unsigned long flags; int rc; + if (!prop) + return -ENODEV; + mutex_lock(&of_mutex); raw_spin_lock_irqsave(&devtree_lock, flags); diff --git a/drivers/pci/hotplug/rpadlpar_core.c b/drivers/pci/hotplug/rpadlpar_core.c index b46b57d870fc..dc67f39779ec 100644 --- a/drivers/pci/hotplug/rpadlpar_core.c +++ b/drivers/pci/hotplug/rpadlpar_core.c @@ -175,7 +175,7 @@ static int dlpar_add_pci_slot(char *drc_name, struct device_node *dn) struct pci_dev *dev; struct pci_controller *phb; - if (pcibios_find_pci_bus(dn)) + if (pci_find_bus_by_node(dn)) return -EINVAL; /* Add pci bus */ @@ -212,7 +212,7 @@ static int dlpar_remove_phb(char *drc_name, struct device_node *dn) struct pci_dn *pdn; int rc = 0; - if (!pcibios_find_pci_bus(dn)) + if (!pci_find_bus_by_node(dn)) return -EINVAL; /* If pci slot is hotpluggable, use hotplug to remove it */ @@ -356,7 +356,7 @@ int dlpar_remove_pci_slot(char *drc_name, struct device_node *dn) pci_lock_rescan_remove(); - bus = pcibios_find_pci_bus(dn); + bus = pci_find_bus_by_node(dn); if (!bus) { ret = -EINVAL; goto out; @@ -380,7 +380,7 @@ int dlpar_remove_pci_slot(char *drc_name, struct device_node *dn) } /* Remove all devices below slot */ - pcibios_remove_pci_devices(bus); + pci_hp_remove_devices(bus); /* Unmap PCI IO space */ if (pcibios_unmap_io_space(bus)) { diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c index 611f6056221a..8d132024f06e 100644 --- a/drivers/pci/hotplug/rpaphp_core.c +++ b/drivers/pci/hotplug/rpaphp_core.c @@ -404,7 +404,7 @@ static int enable_slot(struct hotplug_slot *hotplug_slot) if (state == PRESENT) { pci_lock_rescan_remove(); - pcibios_add_pci_devices(slot->bus); + pci_hp_add_devices(slot->bus); pci_unlock_rescan_remove(); slot->state = CONFIGURED; } else if (state == EMPTY) { @@ -426,7 +426,7 @@ static int disable_slot(struct hotplug_slot *hotplug_slot) return -EINVAL; pci_lock_rescan_remove(); - pcibios_remove_pci_devices(slot->bus); + pci_hp_remove_devices(slot->bus); pci_unlock_rescan_remove(); vm_unmap_aliases(); diff --git a/drivers/pci/hotplug/rpaphp_pci.c b/drivers/pci/hotplug/rpaphp_pci.c index 7836d6913e67..ea41ea1d3c00 100644 --- a/drivers/pci/hotplug/rpaphp_pci.c +++ b/drivers/pci/hotplug/rpaphp_pci.c @@ -93,7 +93,7 @@ int rpaphp_enable_slot(struct slot *slot) if (rc) return rc; - bus = pcibios_find_pci_bus(slot->dn); + bus = pci_find_bus_by_node(slot->dn); if (!bus) { err("%s: no pci_bus for dn %s\n", __func__, slot->dn->full_name); return -EINVAL; @@ -116,7 +116,7 @@ int rpaphp_enable_slot(struct slot *slot) } if (list_empty(&bus->devices)) - pcibios_add_pci_devices(bus); + pci_hp_add_devices(bus); if (!list_empty(&bus->devices)) { info->adapter_status = CONFIGURED; diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c index 61cf61ac621e..4d7bc3f4124a 100644 --- a/drivers/pcmcia/electra_cf.c +++ b/drivers/pcmcia/electra_cf.c @@ -228,7 +228,7 @@ static int electra_cf_probe(struct platform_device *ofdev) if (!cf->mem_base || !cf->io_virt || !cf->gpio_base || (__ioremap_at(io.start, cf->io_virt, cf->io_size, - _PAGE_NO_CACHE | _PAGE_GUARDED) == NULL)) { + pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL)) { dev_err(device, "can't ioremap ranges\n"); status = -ENOMEM; goto fail1; diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 0582b72ef377..3054e3fa63ac 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -1188,7 +1188,8 @@ static int tce_iommu_attach_group(void *iommu_data, goto unlock_exit; } table_group_tmp = iommu_group_get_iommudata(tcegrp->grp); - if (table_group_tmp->ops != table_group->ops) { + if (table_group_tmp->ops->create_table != + table_group->ops->create_table) { pr_warn("tce_vfio: Group %d is incompatible with group %d\n", iommu_group_id(iommu_group), iommu_group_id(tcegrp->grp)); diff --git a/include/misc/cxl.h b/include/misc/cxl.h index 7d5e2613c7b8..56560c5781b4 100644 --- a/include/misc/cxl.h +++ b/include/misc/cxl.h @@ -126,6 +126,14 @@ int cxl_afu_reset(struct cxl_context *ctx); */ void cxl_set_master(struct cxl_context *ctx); +/* + * Sets the context to use real mode memory accesses to operate with + * translation disabled. Note that this only makes sense for kernel contexts + * under bare metal, and will not work with virtualisation. May only be + * performed on stopped contexts. + */ +int cxl_set_translation_mode(struct cxl_context *ctx, bool real_mode); + /* * Map and unmap the AFU Problem Space area. The amount and location mapped * depends on if this context is a master or slave. diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7e8d792da963..a6c8252d7776 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3456,11 +3456,23 @@ struct ftrace_glob { int type; }; +/* + * If symbols in an architecture don't correspond exactly to the user-visible + * name of what they represent, it is possible to define this function to + * perform the necessary adjustments. +*/ +char * __weak arch_ftrace_match_adjust(char *str, const char *search) +{ + return str; +} + static int ftrace_match(char *str, struct ftrace_glob *g) { int matched = 0; int slen; + str = arch_ftrace_match_adjust(str, g->search); + switch (g->type) { case MATCH_FULL: if (strcmp(str, g->search) == 0) diff --git a/tools/perf/arch/powerpc/include/perf_regs.h b/tools/perf/arch/powerpc/include/perf_regs.h new file mode 100644 index 000000000000..75de0e92e71e --- /dev/null +++ b/tools/perf/arch/powerpc/include/perf_regs.h @@ -0,0 +1,69 @@ +#ifndef ARCH_PERF_REGS_H +#define ARCH_PERF_REGS_H + +#include +#include +#include + +#define PERF_REGS_MASK ((1ULL << PERF_REG_POWERPC_MAX) - 1) +#define PERF_REGS_MAX PERF_REG_POWERPC_MAX +#ifdef __powerpc64__ + #define PERF_SAMPLE_REGS_ABI PERF_SAMPLE_REGS_ABI_64 +#else + #define PERF_SAMPLE_REGS_ABI PERF_SAMPLE_REGS_ABI_32 +#endif + +#define PERF_REG_IP PERF_REG_POWERPC_NIP +#define PERF_REG_SP PERF_REG_POWERPC_R1 + +static const char *reg_names[] = { + [PERF_REG_POWERPC_R0] = "r0", + [PERF_REG_POWERPC_R1] = "r1", + [PERF_REG_POWERPC_R2] = "r2", + [PERF_REG_POWERPC_R3] = "r3", + [PERF_REG_POWERPC_R4] = "r4", + [PERF_REG_POWERPC_R5] = "r5", + [PERF_REG_POWERPC_R6] = "r6", + [PERF_REG_POWERPC_R7] = "r7", + [PERF_REG_POWERPC_R8] = "r8", + [PERF_REG_POWERPC_R9] = "r9", + [PERF_REG_POWERPC_R10] = "r10", + [PERF_REG_POWERPC_R11] = "r11", + [PERF_REG_POWERPC_R12] = "r12", + [PERF_REG_POWERPC_R13] = "r13", + [PERF_REG_POWERPC_R14] = "r14", + [PERF_REG_POWERPC_R15] = "r15", + [PERF_REG_POWERPC_R16] = "r16", + [PERF_REG_POWERPC_R17] = "r17", + [PERF_REG_POWERPC_R18] = "r18", + [PERF_REG_POWERPC_R19] = "r19", + [PERF_REG_POWERPC_R20] = "r20", + [PERF_REG_POWERPC_R21] = "r21", + [PERF_REG_POWERPC_R22] = "r22", + [PERF_REG_POWERPC_R23] = "r23", + [PERF_REG_POWERPC_R24] = "r24", + [PERF_REG_POWERPC_R25] = "r25", + [PERF_REG_POWERPC_R26] = "r26", + [PERF_REG_POWERPC_R27] = "r27", + [PERF_REG_POWERPC_R28] = "r28", + [PERF_REG_POWERPC_R29] = "r29", + [PERF_REG_POWERPC_R30] = "r30", + [PERF_REG_POWERPC_R31] = "r31", + [PERF_REG_POWERPC_NIP] = "nip", + [PERF_REG_POWERPC_MSR] = "msr", + [PERF_REG_POWERPC_ORIG_R3] = "orig_r3", + [PERF_REG_POWERPC_CTR] = "ctr", + [PERF_REG_POWERPC_LINK] = "link", + [PERF_REG_POWERPC_XER] = "xer", + [PERF_REG_POWERPC_CCR] = "ccr", + [PERF_REG_POWERPC_SOFTE] = "softe", + [PERF_REG_POWERPC_TRAP] = "trap", + [PERF_REG_POWERPC_DAR] = "dar", + [PERF_REG_POWERPC_DSISR] = "dsisr" +}; + +static inline const char *perf_reg_name(int id) +{ + return reg_names[id]; +} +#endif /* ARCH_PERF_REGS_H */ diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build index c8fe2074d217..90ad64b231cd 100644 --- a/tools/perf/arch/powerpc/util/Build +++ b/tools/perf/arch/powerpc/util/Build @@ -1,6 +1,8 @@ libperf-y += header.o libperf-y += sym-handling.o libperf-y += kvm-stat.o +libperf-y += perf_regs.o libperf-$(CONFIG_DWARF) += dwarf-regs.o libperf-$(CONFIG_DWARF) += skip-callchain-idx.o +libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c new file mode 100644 index 000000000000..a3c3e1ce6807 --- /dev/null +++ b/tools/perf/arch/powerpc/util/perf_regs.c @@ -0,0 +1,49 @@ +#include "../../perf.h" +#include "../../util/perf_regs.h" + +const struct sample_reg sample_reg_masks[] = { + SMPL_REG(r0, PERF_REG_POWERPC_R0), + SMPL_REG(r1, PERF_REG_POWERPC_R1), + SMPL_REG(r2, PERF_REG_POWERPC_R2), + SMPL_REG(r3, PERF_REG_POWERPC_R3), + SMPL_REG(r4, PERF_REG_POWERPC_R4), + SMPL_REG(r5, PERF_REG_POWERPC_R5), + SMPL_REG(r6, PERF_REG_POWERPC_R6), + SMPL_REG(r7, PERF_REG_POWERPC_R7), + SMPL_REG(r8, PERF_REG_POWERPC_R8), + SMPL_REG(r9, PERF_REG_POWERPC_R9), + SMPL_REG(r10, PERF_REG_POWERPC_R10), + SMPL_REG(r11, PERF_REG_POWERPC_R11), + SMPL_REG(r12, PERF_REG_POWERPC_R12), + SMPL_REG(r13, PERF_REG_POWERPC_R13), + SMPL_REG(r14, PERF_REG_POWERPC_R14), + SMPL_REG(r15, PERF_REG_POWERPC_R15), + SMPL_REG(r16, PERF_REG_POWERPC_R16), + SMPL_REG(r17, PERF_REG_POWERPC_R17), + SMPL_REG(r18, PERF_REG_POWERPC_R18), + SMPL_REG(r19, PERF_REG_POWERPC_R19), + SMPL_REG(r20, PERF_REG_POWERPC_R20), + SMPL_REG(r21, PERF_REG_POWERPC_R21), + SMPL_REG(r22, PERF_REG_POWERPC_R22), + SMPL_REG(r23, PERF_REG_POWERPC_R23), + SMPL_REG(r24, PERF_REG_POWERPC_R24), + SMPL_REG(r25, PERF_REG_POWERPC_R25), + SMPL_REG(r26, PERF_REG_POWERPC_R26), + SMPL_REG(r27, PERF_REG_POWERPC_R27), + SMPL_REG(r28, PERF_REG_POWERPC_R28), + SMPL_REG(r29, PERF_REG_POWERPC_R29), + SMPL_REG(r30, PERF_REG_POWERPC_R30), + SMPL_REG(r31, PERF_REG_POWERPC_R31), + SMPL_REG(nip, PERF_REG_POWERPC_NIP), + SMPL_REG(msr, PERF_REG_POWERPC_MSR), + SMPL_REG(orig_r3, PERF_REG_POWERPC_ORIG_R3), + SMPL_REG(ctr, PERF_REG_POWERPC_CTR), + SMPL_REG(link, PERF_REG_POWERPC_LINK), + SMPL_REG(xer, PERF_REG_POWERPC_XER), + SMPL_REG(ccr, PERF_REG_POWERPC_CCR), + SMPL_REG(softe, PERF_REG_POWERPC_SOFTE), + SMPL_REG(trap, PERF_REG_POWERPC_TRAP), + SMPL_REG(dar, PERF_REG_POWERPC_DAR), + SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR), + SMPL_REG_END +}; diff --git a/tools/perf/arch/powerpc/util/unwind-libunwind.c b/tools/perf/arch/powerpc/util/unwind-libunwind.c new file mode 100644 index 000000000000..9e15f92ae49f --- /dev/null +++ b/tools/perf/arch/powerpc/util/unwind-libunwind.c @@ -0,0 +1,96 @@ +/* + * Copyright 2016 Chandan Kumar, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include "../../util/unwind.h" +#include "../../util/debug.h" + +int libunwind__arch_reg_id(int regnum) +{ + switch (regnum) { + case UNW_PPC64_R0: + return PERF_REG_POWERPC_R0; + case UNW_PPC64_R1: + return PERF_REG_POWERPC_R1; + case UNW_PPC64_R2: + return PERF_REG_POWERPC_R2; + case UNW_PPC64_R3: + return PERF_REG_POWERPC_R3; + case UNW_PPC64_R4: + return PERF_REG_POWERPC_R4; + case UNW_PPC64_R5: + return PERF_REG_POWERPC_R5; + case UNW_PPC64_R6: + return PERF_REG_POWERPC_R6; + case UNW_PPC64_R7: + return PERF_REG_POWERPC_R7; + case UNW_PPC64_R8: + return PERF_REG_POWERPC_R8; + case UNW_PPC64_R9: + return PERF_REG_POWERPC_R9; + case UNW_PPC64_R10: + return PERF_REG_POWERPC_R10; + case UNW_PPC64_R11: + return PERF_REG_POWERPC_R11; + case UNW_PPC64_R12: + return PERF_REG_POWERPC_R12; + case UNW_PPC64_R13: + return PERF_REG_POWERPC_R13; + case UNW_PPC64_R14: + return PERF_REG_POWERPC_R14; + case UNW_PPC64_R15: + return PERF_REG_POWERPC_R15; + case UNW_PPC64_R16: + return PERF_REG_POWERPC_R16; + case UNW_PPC64_R17: + return PERF_REG_POWERPC_R17; + case UNW_PPC64_R18: + return PERF_REG_POWERPC_R18; + case UNW_PPC64_R19: + return PERF_REG_POWERPC_R19; + case UNW_PPC64_R20: + return PERF_REG_POWERPC_R20; + case UNW_PPC64_R21: + return PERF_REG_POWERPC_R21; + case UNW_PPC64_R22: + return PERF_REG_POWERPC_R22; + case UNW_PPC64_R23: + return PERF_REG_POWERPC_R23; + case UNW_PPC64_R24: + return PERF_REG_POWERPC_R24; + case UNW_PPC64_R25: + return PERF_REG_POWERPC_R25; + case UNW_PPC64_R26: + return PERF_REG_POWERPC_R26; + case UNW_PPC64_R27: + return PERF_REG_POWERPC_R27; + case UNW_PPC64_R28: + return PERF_REG_POWERPC_R28; + case UNW_PPC64_R29: + return PERF_REG_POWERPC_R29; + case UNW_PPC64_R30: + return PERF_REG_POWERPC_R30; + case UNW_PPC64_R31: + return PERF_REG_POWERPC_R31; + case UNW_PPC64_LR: + return PERF_REG_POWERPC_LINK; + case UNW_PPC64_CTR: + return PERF_REG_POWERPC_CTR; + case UNW_PPC64_XER: + return PERF_REG_POWERPC_XER; + case UNW_PPC64_NIP: + return PERF_REG_POWERPC_NIP; + default: + pr_err("unwind: invalid reg id %d\n", regnum); + return -EINVAL; + } + return -EINVAL; +} diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index 1e46277286c2..5ad0255f8756 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -23,6 +23,12 @@ $(call detected_var,ARCH) NO_PERF_REGS := 1 +# Additional ARCH settings for ppc +ifeq ($(ARCH),powerpc) + NO_PERF_REGS := 0 + LIBUNWIND_LIBS := -lunwind -lunwind-ppc64 +endif + # Additional ARCH settings for x86 ifeq ($(ARCH),x86) $(call detected,CONFIG_X86) diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c index 6b8eb13e14e4..c4023f22f287 100644 --- a/tools/perf/util/perf_regs.c +++ b/tools/perf/util/perf_regs.c @@ -12,18 +12,18 @@ int perf_reg_value(u64 *valp, struct regs_dump *regs, int id) int i, idx = 0; u64 mask = regs->mask; - if (regs->cache_mask & (1 << id)) + if (regs->cache_mask & (1ULL << id)) goto out; - if (!(mask & (1 << id))) + if (!(mask & (1ULL << id))) return -EINVAL; for (i = 0; i < id; i++) { - if (mask & (1 << i)) + if (mask & (1ULL << i)) idx++; } - regs->cache_mask |= (1 << id); + regs->cache_mask |= (1ULL << id); regs->cache_regs[id] = regs->regs[idx]; out: diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index b08f77cbe31b..4ca83fe80654 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -14,6 +14,7 @@ export CFLAGS SUB_DIRS = benchmarks \ copyloops \ + context_switch \ dscr \ mm \ pmu \ diff --git a/tools/testing/selftests/powerpc/context_switch/.gitignore b/tools/testing/selftests/powerpc/context_switch/.gitignore new file mode 100644 index 000000000000..c1431af7b51c --- /dev/null +++ b/tools/testing/selftests/powerpc/context_switch/.gitignore @@ -0,0 +1 @@ +cp_abort diff --git a/tools/testing/selftests/powerpc/context_switch/Makefile b/tools/testing/selftests/powerpc/context_switch/Makefile new file mode 100644 index 000000000000..e164d1466466 --- /dev/null +++ b/tools/testing/selftests/powerpc/context_switch/Makefile @@ -0,0 +1,10 @@ +TEST_PROGS := cp_abort + +all: $(TEST_PROGS) + +$(TEST_PROGS): ../harness.c ../utils.c + +include ../../lib.mk + +clean: + rm -f $(TEST_PROGS) diff --git a/tools/testing/selftests/powerpc/context_switch/cp_abort.c b/tools/testing/selftests/powerpc/context_switch/cp_abort.c new file mode 100644 index 000000000000..5a5b55afda0e --- /dev/null +++ b/tools/testing/selftests/powerpc/context_switch/cp_abort.c @@ -0,0 +1,110 @@ +/* + * Adapted from Anton Blanchard's context switch microbenchmark. + * + * Copyright 2009, Anton Blanchard, IBM Corporation. + * Copyright 2016, Mikey Neuling, Chris Smart, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * This program tests the copy paste abort functionality of a P9 + * (or later) by setting up two processes on the same CPU, one + * which executes the copy instruction and the other which + * executes paste. + * + * The paste instruction should never succeed, as the cp_abort + * instruction is called by the kernel during a context switch. + * + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include "utils.h" +#include + +#define READ_FD 0 +#define WRITE_FD 1 + +#define NUM_LOOPS 1000 + +/* This defines the "paste" instruction from Power ISA 3.0 Book II, section 4.4. */ +#define PASTE(RA, RB, L, RC) \ + .long (0x7c00070c | (RA) << (31-15) | (RB) << (31-20) | (L) << (31-10) | (RC) << (31-31)) + +int paste(void *i) +{ + int cr; + + asm volatile(str(PASTE(0, %1, 1, 1))";" + "mfcr %0;" + : "=r" (cr) + : "b" (i) + : "memory" + ); + return cr; +} + +/* This defines the "copy" instruction from Power ISA 3.0 Book II, section 4.4. */ +#define COPY(RA, RB, L) \ + .long (0x7c00060c | (RA) << (31-15) | (RB) << (31-20) | (L) << (31-10)) + +void copy(void *i) +{ + asm volatile(str(COPY(0, %0, 1))";" + : + : "b" (i) + : "memory" + ); +} + +int test_cp_abort(void) +{ + /* 128 bytes for a full cache line */ + char buf[128] __cacheline_aligned; + cpu_set_t cpuset; + int fd1[2], fd2[2], pid; + char c; + + /* only run this test on a P9 or later */ + SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_00)); + + /* + * Run both processes on the same CPU, so that copy is more likely + * to leak into a paste. + */ + CPU_ZERO(&cpuset); + CPU_SET(pick_online_cpu(), &cpuset); + FAIL_IF(sched_setaffinity(0, sizeof(cpuset), &cpuset)); + + FAIL_IF(pipe(fd1) || pipe(fd2)); + + pid = fork(); + FAIL_IF(pid < 0); + + if (!pid) { + for (int i = 0; i < NUM_LOOPS; i++) { + FAIL_IF((write(fd1[WRITE_FD], &c, 1)) != 1); + FAIL_IF((read(fd2[READ_FD], &c, 1)) != 1); + /* A paste succeeds if CR0 EQ bit is set */ + FAIL_IF(paste(buf) & 0x20000000); + } + } else { + for (int i = 0; i < NUM_LOOPS; i++) { + FAIL_IF((read(fd1[READ_FD], &c, 1)) != 1); + copy(buf); + FAIL_IF((write(fd2[WRITE_FD], &c, 1) != 1)); + } + } + return 0; + +} + +int main(int argc, char *argv[]) +{ + return test_harness(test_cp_abort, "cp_abort"); +} diff --git a/tools/testing/selftests/powerpc/mm/subpage_prot.c b/tools/testing/selftests/powerpc/mm/subpage_prot.c index 440180ff8089..35ade7406dcd 100644 --- a/tools/testing/selftests/powerpc/mm/subpage_prot.c +++ b/tools/testing/selftests/powerpc/mm/subpage_prot.c @@ -73,7 +73,7 @@ static inline void check_faulted(void *addr, long page, long subpage, int write) want_fault |= (subpage == ((page + 1) % 16)); if (faulted != want_fault) { - printf("Failed at 0x%p (p=%ld,sp=%ld,w=%d), want=%s, got=%s !\n", + printf("Failed at %p (p=%ld,sp=%ld,w=%d), want=%s, got=%s !\n", addr, page, subpage, write, want_fault ? "fault" : "pass", faulted ? "fault" : "pass"); @@ -82,7 +82,7 @@ static inline void check_faulted(void *addr, long page, long subpage, int write) if (faulted) { if (dar != addr) { - printf("Fault expected at 0x%p and happened at 0x%p !\n", + printf("Fault expected at %p and happened at %p !\n", addr, dar); } faulted = 0; @@ -162,7 +162,7 @@ int test_anon(void) mallocblock = (void *)align; - printf("allocated malloc block of 0x%lx bytes at 0x%p\n", + printf("allocated malloc block of 0x%lx bytes at %p\n", mallocsize, mallocblock); printf("testing malloc block...\n"); @@ -197,7 +197,7 @@ int test_file(void) perror("failed to map file"); return 1; } - printf("allocated %s for 0x%lx bytes at 0x%p\n", + printf("allocated %s for 0x%lx bytes at %p\n", file_name, filesize, fileblock); printf("testing file map...\n"); @@ -207,14 +207,16 @@ int test_file(void) int main(int argc, char *argv[]) { - test_harness(test_anon, "subpage_prot_anon"); + int rc; + + rc = test_harness(test_anon, "subpage_prot_anon"); + if (rc) + return rc; if (argc > 1) file_name = argv[1]; else file_name = "tempfile"; - test_harness(test_file, "subpage_prot_file"); - - return 0; + return test_harness(test_file, "subpage_prot_file"); } diff --git a/tools/testing/selftests/powerpc/pmu/ebb/ebb.c b/tools/testing/selftests/powerpc/pmu/ebb/ebb.c index e67452f1bcff..46681fec549b 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/ebb.c +++ b/tools/testing/selftests/powerpc/pmu/ebb/ebb.c @@ -15,7 +15,6 @@ #include #include "trace.h" -#include "reg.h" #include "ebb.h" diff --git a/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c b/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c index 5b1188f10c15..f923228bca22 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c +++ b/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c @@ -7,7 +7,6 @@ #include #include "ebb.h" -#include "reg.h" /* diff --git a/tools/testing/selftests/powerpc/pmu/ebb/reg.h b/tools/testing/selftests/powerpc/reg.h similarity index 74% rename from tools/testing/selftests/powerpc/pmu/ebb/reg.h rename to tools/testing/selftests/powerpc/reg.h index 5921b0dfe2e9..65bfdeeebdee 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/reg.h +++ b/tools/testing/selftests/powerpc/reg.h @@ -9,12 +9,12 @@ #define __stringify_1(x) #x #define __stringify(x) __stringify_1(x) -#define mfspr(rn) ({unsigned long rval; \ - asm volatile("mfspr %0," __stringify(rn) \ - : "=r" (rval)); rval; }) -#define mtspr(rn, v) asm volatile("mtspr " __stringify(rn) ",%0" : \ - : "r" ((unsigned long)(v)) \ - : "memory") +#define mfspr(rn) ({unsigned long rval; \ + asm volatile("mfspr %0," _str(rn) \ + : "=r" (rval)); rval; }) +#define mtspr(rn, v) asm volatile("mtspr " _str(rn) ",%0" : \ + : "r" ((unsigned long)(v)) \ + : "memory") #define mb() asm volatile("sync" : : : "memory"); @@ -46,4 +46,10 @@ #define SPRN_SDAR 781 #define SPRN_SIER 768 +#define SPRN_TEXASR 0x82 +#define SPRN_TFIAR 0x81 /* Transaction Failure Inst Addr */ +#define SPRN_TFHAR 0x80 /* Transaction Failure Handler Addr */ +#define TEXASR_FS 0x08000000 +#define SPRN_TAR 0x32f + #endif /* _SELFTESTS_POWERPC_REG_H */ diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore index 7d0f14b8cb2e..bb942db845bf 100644 --- a/tools/testing/selftests/powerpc/tm/.gitignore +++ b/tools/testing/selftests/powerpc/tm/.gitignore @@ -3,3 +3,6 @@ tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy +tm-fork +tm-tar +tm-tmspr diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index 737f72c964e6..d0505dbd22d5 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -1,4 +1,4 @@ -TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy +TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy tm-fork tm-tar tm-tmspr all: $(TEST_PROGS) @@ -6,6 +6,7 @@ $(TEST_PROGS): ../harness.c ../utils.c tm-syscall: tm-syscall-asm.S tm-syscall: CFLAGS += -mhtm -I../../../../../usr/include +tm-tmspr: CFLAGS += -pthread include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/tm/tm-fork.c b/tools/testing/selftests/powerpc/tm/tm-fork.c new file mode 100644 index 000000000000..8d48579b7778 --- /dev/null +++ b/tools/testing/selftests/powerpc/tm/tm-fork.c @@ -0,0 +1,42 @@ +/* + * Copyright 2015, Michael Neuling, IBM Corp. + * Licensed under GPLv2. + * + * Edited: Rashmica Gupta, Nov 2015 + * + * This test does a fork syscall inside a transaction. Basic sniff test + * to see if we can enter the kernel during a transaction. + */ + +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "tm.h" + +int test_fork(void) +{ + SKIP_IF(!have_htm()); + + asm __volatile__( + "tbegin.;" + "blt 1f; " + "li 0, 2;" /* fork syscall */ + "sc ;" + "tend.;" + "1: ;" + : : : "memory", "r0"); + /* If we reach here, we've passed. Otherwise we've probably crashed + * the kernel */ + + return 0; +} + +int main(int argc, char *argv[]) +{ + return test_harness(test_fork, "tm_fork"); +} diff --git a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c index 8fde93d6021f..d9c49f41515e 100644 --- a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c +++ b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c @@ -31,12 +31,6 @@ #include "utils.h" #include "tm.h" -#define TBEGIN ".long 0x7C00051D ;" -#define TEND ".long 0x7C00055D ;" -#define TCHECK ".long 0x7C00059C ;" -#define TSUSPEND ".long 0x7C0005DD ;" -#define TRESUME ".long 0x7C2005DD ;" -#define SPRN_TEXASR 0x82 #define SPRN_DSCR 0x03 int test_body(void) @@ -55,13 +49,13 @@ int test_body(void) "mtspr %[sprn_dscr], 3;" /* start and suspend a transaction */ - TBEGIN + "tbegin.;" "beq 1f;" - TSUSPEND + "tsuspend.;" /* hard loop until the transaction becomes doomed */ "2: ;" - TCHECK + "tcheck 0;" "bc 4, 0, 2b;" /* record DSCR and TEXASR */ @@ -70,8 +64,8 @@ int test_body(void) "mfspr 3, %[sprn_texasr];" "std 3, %[texasr];" - TRESUME - TEND + "tresume.;" + "tend.;" "li %[rv], 0;" "1: ;" : [rv]"=r"(rv), [dscr2]"=m"(dscr2), [texasr]"=m"(texasr) diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c index e44a238c1d77..1f0eb567438d 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c @@ -60,9 +60,9 @@ int tm_signal_stack() exit(1); asm volatile("li 1, 0 ;" /* stack ptr == NULL */ "1:" - ".long 0x7C00051D ;" /* tbegin */ + "tbegin.;" "beq 1b ;" /* retry forever */ - ".long 0x7C0005DD ; ;" /* tsuspend */ + "tsuspend.;" "ld 2, 0(1) ;" /* trigger segv" */ : : : "memory"); diff --git a/tools/testing/selftests/powerpc/tm/tm-tar.c b/tools/testing/selftests/powerpc/tm/tm-tar.c new file mode 100644 index 000000000000..2d2fcc2b7a60 --- /dev/null +++ b/tools/testing/selftests/powerpc/tm/tm-tar.c @@ -0,0 +1,90 @@ +/* + * Copyright 2015, Michael Neuling, IBM Corp. + * Licensed under GPLv2. + * Original: Michael Neuling 19/7/2013 + * Edited: Rashmica Gupta 01/12/2015 + * + * Do some transactions, see if the tar is corrupted. + * If the transaction is aborted, the TAR should be rolled back to the + * checkpointed value before the transaction began. The value written to + * TAR in suspended mode should only remain in TAR if the transaction + * completes. + */ + +#include +#include +#include +#include + +#include "tm.h" +#include "utils.h" + +int num_loops = 10000; + +int test_tar(void) +{ + int i; + + SKIP_IF(!have_htm()); + + for (i = 0; i < num_loops; i++) + { + uint64_t result = 0; + asm __volatile__( + "li 7, 1;" + "mtspr %[tar], 7;" /* tar = 1 */ + "tbegin.;" + "beq 3f;" + "li 4, 0x7000;" /* Loop lots, to use time */ + "2:;" /* Start loop */ + "li 7, 2;" + "mtspr %[tar], 7;" /* tar = 2 */ + "tsuspend.;" + "li 7, 3;" + "mtspr %[tar], 7;" /* tar = 3 */ + "tresume.;" + "subi 4, 4, 1;" + "cmpdi 4, 0;" + "bne 2b;" + "tend.;" + + /* Transaction sucess! TAR should be 3 */ + "mfspr 7, %[tar];" + "ori %[res], 7, 4;" // res = 3|4 = 7 + "b 4f;" + + /* Abort handler. TAR should be rolled back to 1 */ + "3:;" + "mfspr 7, %[tar];" + "ori %[res], 7, 8;" // res = 1|8 = 9 + "4:;" + + : [res]"=r"(result) + : [tar]"i"(SPRN_TAR) + : "memory", "r0", "r4", "r7"); + + /* If result is anything else other than 7 or 9, the tar + * value must have been corrupted. */ + if ((result != 7) && (result != 9)) + return 1; + } + return 0; +} + +int main(int argc, char *argv[]) +{ + /* A low number of iterations (eg 100) can cause a false pass */ + if (argc > 1) { + if (strcmp(argv[1], "-h") == 0) { + printf("Syntax:\n\t%s []\n", + argv[0]); + return 1; + } else { + num_loops = atoi(argv[1]); + } + } + + printf("Starting, %d loops\n", num_loops); + + return test_harness(test_tar, "tm_tar"); +} diff --git a/tools/testing/selftests/powerpc/tm/tm-tmspr.c b/tools/testing/selftests/powerpc/tm/tm-tmspr.c new file mode 100644 index 000000000000..2bda81c7bf23 --- /dev/null +++ b/tools/testing/selftests/powerpc/tm/tm-tmspr.c @@ -0,0 +1,143 @@ +/* + * Copyright 2015, Michael Neuling, IBM Corp. + * Licensed under GPLv2. + * + * Original: Michael Neuling 3/4/2014 + * Modified: Rashmica Gupta 8/12/2015 + * + * Check if any of the Transaction Memory SPRs get corrupted. + * - TFIAR - stores address of location of transaction failure + * - TFHAR - stores address of software failure handler (if transaction + * fails) + * - TEXASR - lots of info about the transacion(s) + * + * (1) create more threads than cpus + * (2) in each thread: + * (a) set TFIAR and TFHAR a unique value + * (b) loop for awhile, continually checking to see if + * either register has been corrupted. + * + * (3) Loop: + * (a) begin transaction + * (b) abort transaction + * (c) check TEXASR to see if FS has been corrupted + * + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +#include "utils.h" +#include "tm.h" + +int num_loops = 10000; +int passed = 1; + +void tfiar_tfhar(void *in) +{ + int i, cpu; + unsigned long tfhar, tfhar_rd, tfiar, tfiar_rd; + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + cpu = (unsigned long)in >> 1; + CPU_SET(cpu, &cpuset); + sched_setaffinity(0, sizeof(cpuset), &cpuset); + + /* TFIAR: Last bit has to be high so userspace can read register */ + tfiar = ((unsigned long)in) + 1; + tfiar += 2; + mtspr(SPRN_TFIAR, tfiar); + + /* TFHAR: Last two bits are reserved */ + tfhar = ((unsigned long)in); + tfhar &= ~0x3UL; + tfhar += 4; + mtspr(SPRN_TFHAR, tfhar); + + for (i = 0; i < num_loops; i++) { + tfhar_rd = mfspr(SPRN_TFHAR); + tfiar_rd = mfspr(SPRN_TFIAR); + if ( (tfhar != tfhar_rd) || (tfiar != tfiar_rd) ) { + passed = 0; + return; + } + } + return; +} + +void texasr(void *in) +{ + unsigned long i; + uint64_t result = 0; + + for (i = 0; i < num_loops; i++) { + asm __volatile__( + "tbegin.;" + "beq 3f ;" + "tabort. 0 ;" + "tend.;" + + /* Abort handler */ + "3: ;" + ::: "memory"); + + /* Check the TEXASR */ + result = mfspr(SPRN_TEXASR); + if ((result & TEXASR_FS) == 0) { + passed = 0; + return; + } + } + return; +} + +int test_tmspr() +{ + pthread_t thread; + int thread_num; + unsigned long i; + + SKIP_IF(!have_htm()); + + /* To cause some context switching */ + thread_num = 10 * sysconf(_SC_NPROCESSORS_ONLN); + + /* Test TFIAR and TFHAR */ + for (i = 0 ; i < thread_num ; i += 2){ + if (pthread_create(&thread, NULL, (void*)tfiar_tfhar, (void *)i)) + return EXIT_FAILURE; + } + if (pthread_join(thread, NULL) != 0) + return EXIT_FAILURE; + + /* Test TEXASR */ + for (i = 0 ; i < thread_num ; i++){ + if (pthread_create(&thread, NULL, (void*)texasr, (void *)i)) + return EXIT_FAILURE; + } + if (pthread_join(thread, NULL) != 0) + return EXIT_FAILURE; + + if (passed) + return 0; + else + return 1; +} + +int main(int argc, char *argv[]) +{ + if (argc > 1) { + if (strcmp(argv[1], "-h") == 0) { + printf("Syntax:\t []\n"); + return 0; + } else { + num_loops = atoi(argv[1]); + } + } + return test_harness(test_tmspr, "tm_tmspr"); +} diff --git a/tools/testing/selftests/powerpc/utils.h b/tools/testing/selftests/powerpc/utils.h index 175ac6ad10dd..a985cfaa535e 100644 --- a/tools/testing/selftests/powerpc/utils.h +++ b/tools/testing/selftests/powerpc/utils.h @@ -6,9 +6,12 @@ #ifndef _SELFTESTS_POWERPC_UTILS_H #define _SELFTESTS_POWERPC_UTILS_H +#define __cacheline_aligned __attribute__((aligned(128))) + #include #include #include +#include "reg.h" /* Avoid headaches with PRI?64 - just use %ll? always */ typedef unsigned long long u64; @@ -54,4 +57,9 @@ do { \ #define _str(s) #s #define str(s) _str(s) +/* POWER9 feature */ +#ifndef PPC_FEATURE2_ARCH_3_00 +#define PPC_FEATURE2_ARCH_3_00 0x00800000 +#endif + #endif /* _SELFTESTS_POWERPC_UTILS_H */