From f773f0a331d6c41733b17bebbc1b6cae12e016f5 Mon Sep 17 00:00:00 2001 From: ZhaoLong Wang Date: Sat, 4 Mar 2023 09:41:41 +0800 Subject: [PATCH 001/175] ubi: Fix deadlock caused by recursively holding work_sem During the processing of the bgt, if the sync_erase() return -EBUSY or some other error code in __erase_worker(),schedule_erase() called again lead to the down_read(ubi->work_sem) hold twice and may get block by down_write(ubi->work_sem) in ubi_update_fastmap(), which cause deadlock. ubi bgt other task do_work down_read(&ubi->work_sem) ubi_update_fastmap erase_worker # Blocked by down_read __erase_worker down_write(&ubi->work_sem) schedule_erase schedule_ubi_work down_read(&ubi->work_sem) Fix this by changing input parameter @nested of the schedule_erase() to 'true' to avoid recursively acquiring the down_read(&ubi->work_sem). Also, fix the incorrect comment about @nested parameter of the schedule_erase() because when down_write(ubi->work_sem) is held, the @nested is also need be true. Link: https://bugzilla.kernel.org/show_bug.cgi?id=217093 Fixes: 2e8f08deabbc ("ubi: Fix races around ubi_refill_pools()") Signed-off-by: ZhaoLong Wang Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/wl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index 40f39e5d6dfc..26a214f016c1 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -575,7 +575,7 @@ static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, * @vol_id: the volume ID that last used this PEB * @lnum: the last used logical eraseblock number for the PEB * @torture: if the physical eraseblock has to be tortured - * @nested: denotes whether the work_sem is already held in read mode + * @nested: denotes whether the work_sem is already held * * This function returns zero in case of success and a %-ENOMEM in case of * failure. @@ -1131,7 +1131,7 @@ static int __erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk) int err1; /* Re-schedule the LEB for erasure */ - err1 = schedule_erase(ubi, e, vol_id, lnum, 0, false); + err1 = schedule_erase(ubi, e, vol_id, lnum, 0, true); if (err1) { spin_lock(&ubi->wl_lock); wl_entry_destroy(ubi, e); From 0643bedf921efd88df73847fb0bb31f9f8692ce0 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 7 Feb 2023 17:47:49 -0600 Subject: [PATCH 002/175] arm64: dts: rockchip: Fix rk3399 GICv3 ITS node name The GICv3 ITS is an MSI controller, therefore its node name should be 'msi-controller'. Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20230207234750.202154-1-robh@kernel.org Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index 1881b4b71f91..40e7c4a70055 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -552,7 +552,7 @@ <0x0 0xfff10000 0 0x10000>, /* GICH */ <0x0 0xfff20000 0 0x10000>; /* GICV */ interrupts = ; - its: interrupt-controller@fee20000 { + its: msi-controller@fee20000 { compatible = "arm,gic-v3-its"; msi-controller; #msi-cells = <1>; From 02c84f91adb9a64b75ec97d772675c02a3e65ed7 Mon Sep 17 00:00:00 2001 From: Jianqun Xu Date: Wed, 8 Feb 2023 17:14:11 +0800 Subject: [PATCH 003/175] ARM: dts: rockchip: fix a typo error for rk3288 spdif node Fix the address in the spdif node name. Fixes: 874e568e500a ("ARM: dts: rockchip: Add SPDIF transceiver for RK3288") Signed-off-by: Jianqun Xu Reviewed-by: Sjoerd Simons Link: https://lore.kernel.org/r/20230208091411.1603142-1-jay.xu@rock-chips.com Signed-off-by: Heiko Stuebner --- arch/arm/boot/dts/rk3288.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi index 2ca76b69add7..511ca864c1b2 100644 --- a/arch/arm/boot/dts/rk3288.dtsi +++ b/arch/arm/boot/dts/rk3288.dtsi @@ -942,7 +942,7 @@ status = "disabled"; }; - spdif: sound@ff88b0000 { + spdif: sound@ff8b0000 { compatible = "rockchip,rk3288-spdif", "rockchip,rk3066-spdif"; reg = <0x0 0xff8b0000 0x0 0x10000>; #sound-dai-cells = <0>; From 5912b647bd0732ae8c78a6e5b259c82efd177d93 Mon Sep 17 00:00:00 2001 From: Dan Johansen Date: Sat, 4 Mar 2023 17:41:35 +0100 Subject: [PATCH 004/175] arm64: dts: rockchip: Lower sd speed on rk3566-soquartz Just like the Quartz64 Model B the previously stated speed of sdr-104 in soquartz is too high for the hardware to reliably communicate with some fast SD cards. Especially on some carrierboards. Lower this to sd-uhs-sdr50 to fix this. Fixes: 5859b5a9c3ac ("arm64: dts: rockchip: add SoQuartz CM4IO dts") Signed-off-by: Dan Johansen Acked-by: Peter Geis Link: https://lore.kernel.org/r/20230304164135.28430-1-strit@manjaro.org Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi index ce7165d7f1a1..102e448bc026 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi @@ -598,7 +598,7 @@ non-removable; pinctrl-names = "default"; pinctrl-0 = <&sdmmc1_bus4 &sdmmc1_cmd &sdmmc1_clk>; - sd-uhs-sdr104; + sd-uhs-sdr50; vmmc-supply = <&vcc3v3_sys>; vqmmc-supply = <&vcc_1v8>; status = "okay"; From 78aedee18a86abb0bb8e31d994467c46656e9b5d Mon Sep 17 00:00:00 2001 From: Dan Johansen Date: Sun, 5 Mar 2023 11:47:31 +0100 Subject: [PATCH 005/175] arm64: dts: rockchip: Lower SD card speed on rk3399 Pinebook Pro MicroSD card slot in the Pinebook Pro is located on a separate daughterboard that's connected to the mainboard using a rather long flat cable. The resulting signal degradation causes many perfectly fine microSD cards not to work in the Pinebook Pro, which is a common source of frustration among the owners. Changing the mode and lowering the speed reportedly fixes this issue and makes many microSD cards work as expected. Co-developed-by: Dragan Simic Signed-off-by: Dragan Simic Tested-by: JR Gonzalez Signed-off-by: Dan Johansen Link: https://lore.kernel.org/r/20230305104730.15849-1-strit@manjaro.org Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts b/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts index 194e48c755f6..54bb0398128f 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts +++ b/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts @@ -943,7 +943,7 @@ disable-wp; pinctrl-names = "default"; pinctrl-0 = <&sdmmc_clk &sdmmc_cmd &sdmmc_bus4>; - sd-uhs-sdr104; + sd-uhs-sdr50; vmmc-supply = <&vcc3v0_sd>; vqmmc-supply = <&vcc_sdio>; status = "okay"; From 172fa6366c0c84eda31f1bc34e6c3e4698786215 Mon Sep 17 00:00:00 2001 From: Jules Maselbas Date: Wed, 22 Feb 2023 18:30:09 +0100 Subject: [PATCH 006/175] tee: optee: Fix typo Unuspported -> Unsupported Fix typo Unuspported -> Unsupported Signed-off-by: Jules Maselbas Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/optee/call.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tee/optee/call.c b/drivers/tee/optee/call.c index 290b1bb0e9cd..df5fb5410b72 100644 --- a/drivers/tee/optee/call.c +++ b/drivers/tee/optee/call.c @@ -488,7 +488,7 @@ static bool is_normal_memory(pgprot_t p) #elif defined(CONFIG_ARM64) return (pgprot_val(p) & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL); #else -#error "Unuspported architecture" +#error "Unsupported architecture" #endif } From 47d43086531f10539470a63e8ad92803e686a3dd Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Thu, 16 Mar 2023 10:36:24 +0800 Subject: [PATCH 007/175] clk: sprd: set max_register according to mapping range In sprd clock driver, regmap_config.max_register was set to a fixed value which is likely larger than the address range configured in device tree, when reading registers through debugfs it would cause access violation. Fixes: d41f59fd92f2 ("clk: sprd: Add common infrastructure") Signed-off-by: Chunyan Zhang Link: https://lore.kernel.org/r/20230316023624.758204-1-chunyan.zhang@unisoc.com Signed-off-by: Stephen Boyd --- drivers/clk/sprd/common.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/clk/sprd/common.c b/drivers/clk/sprd/common.c index ce81e4087a8f..2bfbab8db94b 100644 --- a/drivers/clk/sprd/common.c +++ b/drivers/clk/sprd/common.c @@ -17,7 +17,6 @@ static const struct regmap_config sprdclk_regmap_config = { .reg_bits = 32, .reg_stride = 4, .val_bits = 32, - .max_register = 0xffff, .fast_io = true, }; @@ -43,6 +42,8 @@ int sprd_clk_regmap_init(struct platform_device *pdev, struct device *dev = &pdev->dev; struct device_node *node = dev->of_node, *np; struct regmap *regmap; + struct resource *res; + struct regmap_config reg_config = sprdclk_regmap_config; if (of_find_property(node, "sprd,syscon", NULL)) { regmap = syscon_regmap_lookup_by_phandle(node, "sprd,syscon"); @@ -59,12 +60,14 @@ int sprd_clk_regmap_init(struct platform_device *pdev, return PTR_ERR(regmap); } } else { - base = devm_platform_ioremap_resource(pdev, 0); + base = devm_platform_get_and_ioremap_resource(pdev, 0, &res); if (IS_ERR(base)) return PTR_ERR(base); + reg_config.max_register = resource_size(res) - reg_config.reg_stride; + regmap = devm_regmap_init_mmio(&pdev->dev, base, - &sprdclk_regmap_config); + ®_config); if (IS_ERR(regmap)) { pr_err("failed to init regmap\n"); return PTR_ERR(regmap); From fcdb1eda5302599045bb366e679cccb4216f3873 Mon Sep 17 00:00:00 2001 From: Josh Don Date: Wed, 15 Mar 2023 14:40:29 -0700 Subject: [PATCH 008/175] cgroup: fix display of forceidle time at root We need to reset forceidle_sum to 0 when reading from root, since the bstat we accumulate into is stack allocated. To make this more robust, just replace the existing cputime reset with a memset of the overall bstat. Signed-off-by: Josh Don Fixes: 1fcf54deb767 ("sched/core: add forced idle accounting for cgroups") Cc: stable@vger.kernel.org # v6.0+ Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 831f1f472bb8..0a2b4967e333 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -457,9 +457,7 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) struct task_cputime *cputime = &bstat->cputime; int i; - cputime->stime = 0; - cputime->utime = 0; - cputime->sum_exec_runtime = 0; + memset(bstat, 0, sizeof(*bstat)); for_each_possible_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; From 30ed9ee9a10a90ae719dcfcacead1d0506fa45ed Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Wed, 15 Mar 2023 09:52:28 -0500 Subject: [PATCH 009/175] RDMA/irdma: Do not generate SW completions for NOPs Currently, artificial SW completions are generated for NOP wqes which can generate unexpected completions with wr_id = 0. Skip the generation of artificial completions for NOPs. Fixes: 81091d7696ae ("RDMA/irdma: Add SW mechanism to generate completions on error") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Link: https://lore.kernel.org/r/20230315145231.931-2-shiraz.saleem@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/utils.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 445e69e86409..7887230c867b 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -2595,7 +2595,10 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) /* remove the SQ WR by moving SQ tail*/ IRDMA_RING_SET_TAIL(*sq_ring, sq_ring->tail + qp->sq_wrtrk_array[sq_ring->tail].quanta); - + if (cmpl->cpi.op_type == IRDMAQP_OP_NOP) { + kfree(cmpl); + continue; + } ibdev_dbg(iwqp->iwscq->ibcq.device, "DEV: %s: adding wr_id = 0x%llx SQ Completion to list qp_id=%d\n", __func__, cmpl->cpi.wr_id, qp->qp_id); From b69a6979dbaa2453675fe9c71bdc2497fedb11f9 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Wed, 15 Mar 2023 09:52:29 -0500 Subject: [PATCH 010/175] RDMA/irdma: Fix memory leak of PBLE objects On rmmod of irdma, the PBLE object memory is not being freed. PBLE object memory are not statically pre-allocated at function initialization time unlike other HMC objects. PBLEs objects and the Segment Descriptors (SD) for it can be dynamically allocated during scale up and SD's remain allocated till function deinitialization. Fix this leak by adding IRDMA_HMC_IW_PBLE to the iw_hmc_obj_types[] table and skip pbles in irdma_create_hmc_obj but not in irdma_del_hmc_objects(). Fixes: 44d9e52977a1 ("RDMA/irdma: Implement device initialization definitions") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Link: https://lore.kernel.org/r/20230315145231.931-3-shiraz.saleem@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/hw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 2e1e2bad0401..43dfa4761f06 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -41,6 +41,7 @@ static enum irdma_hmc_rsrc_type iw_hmc_obj_types[] = { IRDMA_HMC_IW_XFFL, IRDMA_HMC_IW_Q1, IRDMA_HMC_IW_Q1FL, + IRDMA_HMC_IW_PBLE, IRDMA_HMC_IW_TIMER, IRDMA_HMC_IW_FSIMC, IRDMA_HMC_IW_FSIAV, @@ -827,6 +828,8 @@ static int irdma_create_hmc_objs(struct irdma_pci_f *rf, bool privileged, info.entry_type = rf->sd_type; for (i = 0; i < IW_HMC_OBJ_TYPE_NUM; i++) { + if (iw_hmc_obj_types[i] == IRDMA_HMC_IW_PBLE) + continue; if (dev->hmc_info->hmc_obj[iw_hmc_obj_types[i]].cnt) { info.rsrc_type = iw_hmc_obj_types[i]; info.count = dev->hmc_info->hmc_obj[info.rsrc_type].cnt; From 8385a875c9eecc429b2f72970efcbb0e5cb5b547 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Wed, 15 Mar 2023 09:52:30 -0500 Subject: [PATCH 011/175] RDMA/irdma: Increase iWARP CM default rexmit count When running perftest with large number of connections in iWARP mode, the passive side could be slow to respond. Increase the rexmit counter default to allow scaling connections. Fixes: 146b9756f14c ("RDMA/irdma: Add connection manager") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Link: https://lore.kernel.org/r/20230315145231.931-4-shiraz.saleem@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/cm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/irdma/cm.h b/drivers/infiniband/hw/irdma/cm.h index 19c284975fc7..7feadb3e1eda 100644 --- a/drivers/infiniband/hw/irdma/cm.h +++ b/drivers/infiniband/hw/irdma/cm.h @@ -41,7 +41,7 @@ #define TCP_OPTIONS_PADDING 3 #define IRDMA_DEFAULT_RETRYS 64 -#define IRDMA_DEFAULT_RETRANS 8 +#define IRDMA_DEFAULT_RETRANS 32 #define IRDMA_DEFAULT_TTL 0x40 #define IRDMA_DEFAULT_RTT_VAR 6 #define IRDMA_DEFAULT_SS_THRESH 0x3fffffff From e4522c097ec10f23ea0933e9e69d4fa9d8ae9441 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Wed, 15 Mar 2023 09:52:31 -0500 Subject: [PATCH 012/175] RDMA/irdma: Add ipv4 check to irdma_find_listener() Add ipv4 check to irdma_find_listener(). Otherwise the function incorrectly finds and returns a listener with a different addr family for the zero IP addr, if a listener with a zero IP addr and the same port as the one searched for has already been created. Fixes: 146b9756f14c ("RDMA/irdma: Add connection manager") Signed-off-by: Tatyana Nikolova Signed-off-by: Shiraz Saleem Link: https://lore.kernel.org/r/20230315145231.931-5-shiraz.saleem@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/cm.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 195aa9ea18b6..8817864154af 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -1458,13 +1458,15 @@ static int irdma_send_fin(struct irdma_cm_node *cm_node) * irdma_find_listener - find a cm node listening on this addr-port pair * @cm_core: cm's core * @dst_addr: listener ip addr + * @ipv4: flag indicating IPv4 when true * @dst_port: listener tcp port num * @vlan_id: virtual LAN ID * @listener_state: state to match with listen node's */ static struct irdma_cm_listener * -irdma_find_listener(struct irdma_cm_core *cm_core, u32 *dst_addr, u16 dst_port, - u16 vlan_id, enum irdma_cm_listener_state listener_state) +irdma_find_listener(struct irdma_cm_core *cm_core, u32 *dst_addr, bool ipv4, + u16 dst_port, u16 vlan_id, + enum irdma_cm_listener_state listener_state) { struct irdma_cm_listener *listen_node; static const u32 ip_zero[4] = { 0, 0, 0, 0 }; @@ -1477,7 +1479,7 @@ irdma_find_listener(struct irdma_cm_core *cm_core, u32 *dst_addr, u16 dst_port, list_for_each_entry (listen_node, &cm_core->listen_list, list) { memcpy(listen_addr, listen_node->loc_addr, sizeof(listen_addr)); listen_port = listen_node->loc_port; - if (listen_port != dst_port || + if (listen_node->ipv4 != ipv4 || listen_port != dst_port || !(listener_state & listen_node->listener_state)) continue; /* compare node pair, return node handle if a match */ @@ -2902,9 +2904,10 @@ irdma_make_listen_node(struct irdma_cm_core *cm_core, unsigned long flags; /* cannot have multiple matching listeners */ - listener = irdma_find_listener(cm_core, cm_info->loc_addr, - cm_info->loc_port, cm_info->vlan_id, - IRDMA_CM_LISTENER_EITHER_STATE); + listener = + irdma_find_listener(cm_core, cm_info->loc_addr, cm_info->ipv4, + cm_info->loc_port, cm_info->vlan_id, + IRDMA_CM_LISTENER_EITHER_STATE); if (listener && listener->listener_state == IRDMA_CM_LISTENER_ACTIVE_STATE) { refcount_dec(&listener->refcnt); @@ -3153,6 +3156,7 @@ void irdma_receive_ilq(struct irdma_sc_vsi *vsi, struct irdma_puda_buf *rbuf) listener = irdma_find_listener(cm_core, cm_info.loc_addr, + cm_info.ipv4, cm_info.loc_port, cm_info.vlan_id, IRDMA_CM_LISTENER_ACTIVE_STATE); From 88c9483faf15ada14eca82714114656893063458 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Thu, 16 Mar 2023 15:40:49 +0200 Subject: [PATCH 013/175] IB/mlx5: Add support for 400G_8X lane speed Currently, when driver queries PTYS to report which link speed is being used on its RoCE ports, it does not check the case of having 400Gbps transmitted over 8 lanes. Thus it fails to report the said speed and instead it defaults to report 10G over 4 lanes. Add a check for the said speed when querying PTYS and report it back correctly when needed. Fixes: 08e8676f1607 ("IB/mlx5: Add support for 50Gbps per lane link modes") Signed-off-by: Maher Sanalla Reviewed-by: Aya Levin Reviewed-by: Saeed Mahameed Link: https://lore.kernel.org/r/ec9040548d119d22557d6a4b4070d6f421701fd4.1678973994.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 5b988db66b8f..5d45de223c43 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -442,6 +442,10 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed, *active_width = IB_WIDTH_2X; *active_speed = IB_SPEED_NDR; break; + case MLX5E_PROT_MASK(MLX5E_400GAUI_8): + *active_width = IB_WIDTH_8X; + *active_speed = IB_SPEED_HDR; + break; case MLX5E_PROT_MASK(MLX5E_400GAUI_4_400GBASE_CR4_KR4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_NDR; From 3fe26c0493e4c2da4b7d8ba8c975a6f48fb75ec2 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Mon, 20 Mar 2023 16:46:49 +0800 Subject: [PATCH 014/175] RDMA/erdma: Fix some typos FAA is short for atomic fetch and add, not FAD. Fix this. Fixes: 0ca9c2e2844a ("RDMA/erdma: Implement atomic operations support") Signed-off-by: Cheng Xu Link: https://lore.kernel.org/r/20230320084652.16807-2-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_cq.c | 2 +- drivers/infiniband/hw/erdma/erdma_hw.h | 2 +- drivers/infiniband/hw/erdma/erdma_qp.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_cq.c b/drivers/infiniband/hw/erdma/erdma_cq.c index cabd8678b355..7bc354273d4e 100644 --- a/drivers/infiniband/hw/erdma/erdma_cq.c +++ b/drivers/infiniband/hw/erdma/erdma_cq.c @@ -65,7 +65,7 @@ static const enum ib_wc_opcode wc_mapping_table[ERDMA_NUM_OPCODES] = { [ERDMA_OP_LOCAL_INV] = IB_WC_LOCAL_INV, [ERDMA_OP_READ_WITH_INV] = IB_WC_RDMA_READ, [ERDMA_OP_ATOMIC_CAS] = IB_WC_COMP_SWAP, - [ERDMA_OP_ATOMIC_FAD] = IB_WC_FETCH_ADD, + [ERDMA_OP_ATOMIC_FAA] = IB_WC_FETCH_ADD, }; static const struct { diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 4c38d99c73f1..5d3a541db941 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -491,7 +491,7 @@ enum erdma_opcode { ERDMA_OP_LOCAL_INV = 15, ERDMA_OP_READ_WITH_INV = 16, ERDMA_OP_ATOMIC_CAS = 17, - ERDMA_OP_ATOMIC_FAD = 18, + ERDMA_OP_ATOMIC_FAA = 18, ERDMA_NUM_OPCODES = 19, ERDMA_OP_INVALID = ERDMA_NUM_OPCODES + 1 }; diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index d088d6bef431..ff473b208acf 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -439,7 +439,7 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, cpu_to_le64(atomic_wr(send_wr)->compare_add); } else { wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, - ERDMA_OP_ATOMIC_FAD); + ERDMA_OP_ATOMIC_FAA); atomic_sqe->fetchadd_swap_data = cpu_to_le64(atomic_wr(send_wr)->compare_add); } From 6256aa9ae955d10ec73a434533ca62034eff1b76 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Mon, 20 Mar 2023 16:46:50 +0800 Subject: [PATCH 015/175] RDMA/erdma: Update default EQ depth to 4096 and max_send_wr to 8192 Max EQ depth of hardware is 32K, the current default EQ depth is too small for some applications, so change the default depth to 4096. Max send WRs the hardware can support is 8K, but the driver limits the value to 4K. Remove this limitation. Fixes: be3cff0f242d ("RDMA/erdma: Add the hardware related definitions") Fixes: db23ae64caac ("RDMA/erdma: Add verbs header file") Signed-off-by: Cheng Xu Link: https://lore.kernel.org/r/20230320084652.16807-3-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_hw.h | 2 +- drivers/infiniband/hw/erdma/erdma_verbs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 5d3a541db941..37ad1bb1917c 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -441,7 +441,7 @@ struct erdma_reg_mr_sqe { }; /* EQ related. */ -#define ERDMA_DEFAULT_EQ_DEPTH 256 +#define ERDMA_DEFAULT_EQ_DEPTH 4096 /* ceqe */ #define ERDMA_CEQE_HDR_DB_MASK BIT_ULL(63) diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index e0a993bc032a..131cf5f40982 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -11,7 +11,7 @@ /* RDMA Capability. */ #define ERDMA_MAX_PD (128 * 1024) -#define ERDMA_MAX_SEND_WR 4096 +#define ERDMA_MAX_SEND_WR 8192 #define ERDMA_MAX_ORD 128 #define ERDMA_MAX_IRD 128 #define ERDMA_MAX_SGE_RD 1 From 0dd83a4d7756713f81990d6c5547500f212a1190 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Mon, 20 Mar 2023 16:46:51 +0800 Subject: [PATCH 016/175] RDMA/erdma: Inline mtt entries into WQE if supported The max inline mtt count supported is ERDMA_MAX_INLINE_MTT_ENTRIES. When mr->mem.mtt_nents == ERDMA_MAX_INLINE_MTT_ENTRIES, inline mtt is also supported, fix it. Fixes: 155055771704 ("RDMA/erdma: Add verbs implementation") Signed-off-by: Cheng Xu Link: https://lore.kernel.org/r/20230320084652.16807-4-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index ff473b208acf..44923c51a01b 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -405,7 +405,7 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, FIELD_PREP(ERDMA_SQE_MR_MTT_CNT_MASK, mr->mem.mtt_nents); - if (mr->mem.mtt_nents < ERDMA_MAX_INLINE_MTT_ENTRIES) { + if (mr->mem.mtt_nents <= ERDMA_MAX_INLINE_MTT_ENTRIES) { attrs |= FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 0); /* Copy SGLs to SQE content to accelerate */ memcpy(get_queue_entry(qp->kern_qp.sq_buf, idx + 1, From 6bd1bca858f1734a75572a788213d1e1143f2f0a Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Mon, 20 Mar 2023 16:46:52 +0800 Subject: [PATCH 017/175] RDMA/erdma: Defer probing if netdevice can not be found ERDMA device may be probed before its associated netdevice, returning -EPROBE_DEFER allows OS try to probe erdma device later. Fixes: d55e6fb4803c ("RDMA/erdma: Add the erdma module") Signed-off-by: Cheng Xu Link: https://lore.kernel.org/r/20230320084652.16807-5-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 5dc31e5df5cb..4a29a53a6652 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -56,7 +56,7 @@ done: static int erdma_enum_and_get_netdev(struct erdma_dev *dev) { struct net_device *netdev; - int ret = -ENODEV; + int ret = -EPROBE_DEFER; /* Already binded to a net_device, so we skip. */ if (dev->netdev) From f420f47e56c67587d9bc8f94267327b6fb214c1d Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 10 Mar 2023 17:45:23 +0100 Subject: [PATCH 018/175] clk: imx6ul: fix "failed to get parent" error On some configuration we may get following error: [ 0.000000] imx:clk-gpr-mux: failed to get parent (-EINVAL) This happens if selector is configured to not supported value. To avoid this warnings add dummy parents for not supported values. Fixes: 4e197ee880c2 ("clk: imx6ul: add ethernet refclock mux support") Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20230310164523.534571-1-o.rempel@pengutronix.de Tested-by: Stefan Wahren Reported-by: Stefan Wahren Reviewed-by: Peng Fan Signed-off-by: Stephen Boyd --- drivers/clk/imx/clk-imx6ul.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/clk/imx/clk-imx6ul.c b/drivers/clk/imx/clk-imx6ul.c index 2836adb817b7..e3696a88b5a3 100644 --- a/drivers/clk/imx/clk-imx6ul.c +++ b/drivers/clk/imx/clk-imx6ul.c @@ -95,14 +95,16 @@ static const struct clk_div_table video_div_table[] = { { } }; -static const char * enet1_ref_sels[] = { "enet1_ref_125m", "enet1_ref_pad", }; +static const char * enet1_ref_sels[] = { "enet1_ref_125m", "enet1_ref_pad", "dummy", "dummy"}; static const u32 enet1_ref_sels_table[] = { IMX6UL_GPR1_ENET1_TX_CLK_DIR, - IMX6UL_GPR1_ENET1_CLK_SEL }; + IMX6UL_GPR1_ENET1_CLK_SEL, 0, + IMX6UL_GPR1_ENET1_TX_CLK_DIR | IMX6UL_GPR1_ENET1_CLK_SEL }; static const u32 enet1_ref_sels_table_mask = IMX6UL_GPR1_ENET1_TX_CLK_DIR | IMX6UL_GPR1_ENET1_CLK_SEL; -static const char * enet2_ref_sels[] = { "enet2_ref_125m", "enet2_ref_pad", }; +static const char * enet2_ref_sels[] = { "enet2_ref_125m", "enet2_ref_pad", "dummy", "dummy"}; static const u32 enet2_ref_sels_table[] = { IMX6UL_GPR1_ENET2_TX_CLK_DIR, - IMX6UL_GPR1_ENET2_CLK_SEL }; + IMX6UL_GPR1_ENET2_CLK_SEL, 0, + IMX6UL_GPR1_ENET2_TX_CLK_DIR | IMX6UL_GPR1_ENET2_CLK_SEL }; static const u32 enet2_ref_sels_table_mask = IMX6UL_GPR1_ENET2_TX_CLK_DIR | IMX6UL_GPR1_ENET2_CLK_SEL; From 632e04739c8f45c2d9ca4d4c5bd18d80c2ac9296 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Fri, 10 Mar 2023 08:49:40 +0100 Subject: [PATCH 019/175] clk: rs9: Fix suspend/resume Disabling the cache in commit 2ff4ba9e3702 ("clk: rs9: Fix I2C accessors") without removing cache synchronization in resume path results in a kernel panic as map->cache_ops is unset, due to REGCACHE_NONE. Enable flat cache again to support resume again. num_reg_defaults_raw is necessary to read the cache defaults from hardware. Some registers are strapped in hardware and cannot be provided in software. Fixes: 2ff4ba9e3702 ("clk: rs9: Fix I2C accessors") Signed-off-by: Alexander Stein Link: https://lore.kernel.org/r/20230310074940.3475703-1-alexander.stein@ew.tq-group.com Signed-off-by: Stephen Boyd --- drivers/clk/clk-renesas-pcie.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/clk/clk-renesas-pcie.c b/drivers/clk/clk-renesas-pcie.c index f91f30560820..ff3a52d48479 100644 --- a/drivers/clk/clk-renesas-pcie.c +++ b/drivers/clk/clk-renesas-pcie.c @@ -143,8 +143,9 @@ static int rs9_regmap_i2c_read(void *context, static const struct regmap_config rs9_regmap_config = { .reg_bits = 8, .val_bits = 8, - .cache_type = REGCACHE_NONE, + .cache_type = REGCACHE_FLAT, .max_register = RS9_REG_BCP, + .num_reg_defaults_raw = 0x8, .rd_table = &rs9_readable_table, .wr_table = &rs9_writeable_table, .reg_write = rs9_regmap_i2c_write, From b37115b6534c4027df75854a44b485596d368171 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Fri, 17 Mar 2023 18:41:02 +0100 Subject: [PATCH 020/175] arm64: dts: rockchip: add rk3588 cache level information Add missing, mandatory cache-level information for RK3588. Signed-off-by: Sebastian Reichel Link: https://lore.kernel.org/r/20230317174102.61209-1-sebastian.reichel@collabora.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588s.dtsi | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3588s.dtsi b/arch/arm64/boot/dts/rockchip/rk3588s.dtsi index 005cde61b4b2..a506948b5572 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588s.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588s.dtsi @@ -222,6 +222,7 @@ cache-size = <131072>; cache-line-size = <64>; cache-sets = <512>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -230,6 +231,7 @@ cache-size = <131072>; cache-line-size = <64>; cache-sets = <512>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -238,6 +240,7 @@ cache-size = <131072>; cache-line-size = <64>; cache-sets = <512>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -246,6 +249,7 @@ cache-size = <131072>; cache-line-size = <64>; cache-sets = <512>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -254,6 +258,7 @@ cache-size = <524288>; cache-line-size = <64>; cache-sets = <1024>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -262,6 +267,7 @@ cache-size = <524288>; cache-line-size = <64>; cache-sets = <1024>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -270,6 +276,7 @@ cache-size = <524288>; cache-line-size = <64>; cache-sets = <1024>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -278,6 +285,7 @@ cache-size = <524288>; cache-line-size = <64>; cache-sets = <1024>; + cache-level = <2>; next-level-cache = <&l3_cache>; }; @@ -286,6 +294,7 @@ cache-size = <3145728>; cache-line-size = <64>; cache-sets = <4096>; + cache-level = <3>; }; }; From 58e84f6b3e84e46524b7e5a916b53c1ad798bc8f Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 20 Mar 2023 12:59:55 +0200 Subject: [PATCH 021/175] RDMA/cma: Allow UD qp_type to join multicast only As for multicast: - The SIDR is the only mode that makes sense; - Besides PS_UDP, other port spaces like PS_IB is also allowed, as it is UD compatible. In this case qkey also needs to be set [1]. This patch allows only UD qp_type to join multicast, and set qkey to default if it's not set, to fix an uninit-value error: the ib->rec.qkey field is accessed without being initialized. ===================================================== BUG: KMSAN: uninit-value in cma_set_qkey drivers/infiniband/core/cma.c:510 [inline] BUG: KMSAN: uninit-value in cma_make_mc_event+0xb73/0xe00 drivers/infiniband/core/cma.c:4570 cma_set_qkey drivers/infiniband/core/cma.c:510 [inline] cma_make_mc_event+0xb73/0xe00 drivers/infiniband/core/cma.c:4570 cma_iboe_join_multicast drivers/infiniband/core/cma.c:4782 [inline] rdma_join_multicast+0x2b83/0x30a0 drivers/infiniband/core/cma.c:4814 ucma_process_join+0xa76/0xf60 drivers/infiniband/core/ucma.c:1479 ucma_join_multicast+0x1e3/0x250 drivers/infiniband/core/ucma.c:1546 ucma_write+0x639/0x6d0 drivers/infiniband/core/ucma.c:1732 vfs_write+0x8ce/0x2030 fs/read_write.c:588 ksys_write+0x28c/0x520 fs/read_write.c:643 __do_sys_write fs/read_write.c:655 [inline] __se_sys_write fs/read_write.c:652 [inline] __ia32_sys_write+0xdb/0x120 fs/read_write.c:652 do_syscall_32_irqs_on arch/x86/entry/common.c:114 [inline] __do_fast_syscall_32+0x96/0xf0 arch/x86/entry/common.c:180 do_fast_syscall_32+0x34/0x70 arch/x86/entry/common.c:205 do_SYSENTER_32+0x1b/0x20 arch/x86/entry/common.c:248 entry_SYSENTER_compat_after_hwframe+0x4d/0x5c Local variable ib.i created at: cma_iboe_join_multicast drivers/infiniband/core/cma.c:4737 [inline] rdma_join_multicast+0x586/0x30a0 drivers/infiniband/core/cma.c:4814 ucma_process_join+0xa76/0xf60 drivers/infiniband/core/ucma.c:1479 CPU: 0 PID: 29874 Comm: syz-executor.3 Not tainted 5.16.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 ===================================================== [1] https://lore.kernel.org/linux-rdma/20220117183832.GD84788@nvidia.com/ Fixes: b5de0c60cc30 ("RDMA/cma: Fix use after free race in roce multicast join") Reported-by: syzbot+8fcbb77276d43cc8b693@syzkaller.appspotmail.com Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/58a4a98323b5e6b1282e83f6b76960d06e43b9fa.1679309909.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cma.c | 60 ++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 308155937713..6b9563d4f23c 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -624,22 +624,11 @@ static inline unsigned short cma_family(struct rdma_id_private *id_priv) return id_priv->id.route.addr.src_addr.ss_family; } -static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) +static int cma_set_default_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; - if (id_priv->qkey) { - if (qkey && id_priv->qkey != qkey) - return -EINVAL; - return 0; - } - - if (qkey) { - id_priv->qkey = qkey; - return 0; - } - switch (id_priv->id.ps) { case RDMA_PS_UDP: case RDMA_PS_IB: @@ -659,6 +648,16 @@ static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) return ret; } +static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) +{ + if (!qkey || + (id_priv->qkey && (id_priv->qkey != qkey))) + return -EINVAL; + + id_priv->qkey = qkey; + return 0; +} + static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { dev_addr->dev_type = ARPHRD_INFINIBAND; @@ -1229,7 +1228,7 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { - ret = cma_set_qkey(id_priv, 0); + ret = cma_set_default_qkey(id_priv); if (ret) return ret; @@ -4569,7 +4568,10 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { - ret = cma_set_qkey(id_priv, qkey); + if (qkey) + ret = cma_set_qkey(id_priv, qkey); + else + ret = cma_set_default_qkey(id_priv); if (ret) return ret; rep.qp_num = id_priv->qp_num; @@ -4774,9 +4776,7 @@ static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, enum ib_gid_type gid_type; struct net_device *ndev; - if (!status) - status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); - else + if (status) pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", status); @@ -4804,7 +4804,7 @@ static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, } event->param.ud.qp_num = 0xFFFFFF; - event->param.ud.qkey = be32_to_cpu(multicast->rec.qkey); + event->param.ud.qkey = id_priv->qkey; out: if (ndev) @@ -4823,8 +4823,11 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING) goto out; - cma_make_mc_event(status, id_priv, multicast, &event, mc); - ret = cma_cm_event_handler(id_priv, &event); + ret = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); + if (!ret) { + cma_make_mc_event(status, id_priv, multicast, &event, mc); + ret = cma_cm_event_handler(id_priv, &event); + } rdma_destroy_ah_attr(&event.param.ud.ah_attr); WARN_ON(ret); @@ -4877,9 +4880,11 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, if (ret) return ret; - ret = cma_set_qkey(id_priv, 0); - if (ret) - return ret; + if (!id_priv->qkey) { + ret = cma_set_default_qkey(id_priv); + if (ret) + return ret; + } cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); rec.qkey = cpu_to_be32(id_priv->qkey); @@ -4956,9 +4961,6 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type); ib.rec.pkey = cpu_to_be16(0xffff); - if (id_priv->id.ps == RDMA_PS_UDP) - ib.rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); - if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!ndev) @@ -4984,6 +4986,9 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, if (err || !ib.rec.mtu) return err ?: -EINVAL; + if (!id_priv->qkey) + cma_set_default_qkey(id_priv); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &ib.rec.port_gid); INIT_WORK(&mc->iboe_join.work, cma_iboe_join_work_handler); @@ -5009,6 +5014,9 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED)) return -EINVAL; + if (id_priv->id.qp_type != IB_QPT_UD) + return -EINVAL; + mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return -ENOMEM; From 2265098fd6a6272fde3fd1be5761f2f5895bd99a Mon Sep 17 00:00:00 2001 From: Bhavya Kapoor Date: Fri, 17 Mar 2023 14:57:11 +0530 Subject: [PATCH 022/175] mmc: sdhci_am654: Set HIGH_SPEED_ENA for SDR12 and SDR25 Timing Information in Datasheet assumes that HIGH_SPEED_ENA=1 should be set for SDR12 and SDR25 modes. But sdhci_am654 driver clears HIGH_SPEED_ENA register. Thus, Modify sdhci_am654 to not clear HIGH_SPEED_ENA (HOST_CONTROL[2]) bit for SDR12 and SDR25 speed modes. Fixes: e374e87538f4 ("mmc: sdhci_am654: Clear HISPD_ENA in some lower speed modes") Signed-off-by: Bhavya Kapoor Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230317092711.660897-1-b-kapoor@ti.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci_am654.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index 89953093e20c..672d37ea98d0 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -351,8 +351,6 @@ static void sdhci_am654_write_b(struct sdhci_host *host, u8 val, int reg) */ case MMC_TIMING_SD_HS: case MMC_TIMING_MMC_HS: - case MMC_TIMING_UHS_SDR12: - case MMC_TIMING_UHS_SDR25: val &= ~SDHCI_CTRL_HISPD; } } From a246c20c45a0a2bf5e865a4c3a76822b79b38c80 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 22 Mar 2023 20:30:48 +0100 Subject: [PATCH 023/175] arm64: dts: qcom: sdm850-lenovo-yoga-c630: Use proper WSA881x shutdown GPIO polarity The WSA881x shutdown GPIO is active low (SD_N), but Linux driver assumed DTS always comes with active high. Since Linux drivers were updated to handle proper flag, correct the DTS. The change is not backwards compatible with older Linux kernel. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20230322193051.826167-2-krzysztof.kozlowski@linaro.org --- arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts index 67d2a663ce75..5c688cb6a7ce 100644 --- a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts +++ b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts @@ -753,7 +753,7 @@ left_spkr: speaker@0,3 { compatible = "sdw10217211000"; reg = <0 3>; - powerdown-gpios = <&wcdgpio 1 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&wcdgpio 1 GPIO_ACTIVE_LOW>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrLeft"; #sound-dai-cells = <0>; @@ -761,7 +761,7 @@ right_spkr: speaker@0,4 { compatible = "sdw10217211000"; - powerdown-gpios = <&wcdgpio 2 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&wcdgpio 2 GPIO_ACTIVE_LOW>; reg = <0 4>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrRight"; From 5b91fab8eae27d1436eacde60107bab9987bbd9d Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 22 Mar 2023 20:30:49 +0100 Subject: [PATCH 024/175] arm64: dts: qcom: sdm850-samsung-w737: Use proper WSA881x shutdown GPIO polarity The WSA881x shutdown GPIO is active low (SD_N), but Linux driver assumed DTS always comes with active high. Since Linux drivers were updated to handle proper flag, correct the DTS. The change is not backwards compatible with older Linux kernel. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20230322193051.826167-3-krzysztof.kozlowski@linaro.org --- arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts b/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts index 9850140514ba..41f59e32af64 100644 --- a/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts +++ b/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts @@ -662,7 +662,7 @@ left_spkr: speaker@0,3 { compatible = "sdw10217211000"; reg = <0 3>; - powerdown-gpios = <&wcdgpio 1 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&wcdgpio 1 GPIO_ACTIVE_LOW>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrLeft"; #sound-dai-cells = <0>; @@ -670,7 +670,7 @@ right_spkr: speaker@0,4 { compatible = "sdw10217211000"; - powerdown-gpios = <&wcdgpio 2 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&wcdgpio 2 GPIO_ACTIVE_LOW>; reg = <0 4>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrRight"; From 4ded91530544afdbac350f609e6597076f569e52 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 22 Mar 2023 20:30:50 +0100 Subject: [PATCH 025/175] arm64: dts: qcom: sm8250-mtp: Use proper WSA881x shutdown GPIO polarity The WSA881x shutdown GPIO is active low (SD_N), but Linux driver assumed DTS always comes with active high. Since Linux drivers were updated to handle proper flag, correct the DTS. The change is not backwards compatible with older Linux kernel. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20230322193051.826167-4-krzysztof.kozlowski@linaro.org --- arch/arm64/boot/dts/qcom/sm8250-mtp.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/sm8250-mtp.dts b/arch/arm64/boot/dts/qcom/sm8250-mtp.dts index e54cdc8bc31f..4c9de236676d 100644 --- a/arch/arm64/boot/dts/qcom/sm8250-mtp.dts +++ b/arch/arm64/boot/dts/qcom/sm8250-mtp.dts @@ -764,7 +764,7 @@ left_spkr: speaker@0,3 { compatible = "sdw10217211000"; reg = <0 3>; - powerdown-gpios = <&tlmm 26 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&tlmm 26 GPIO_ACTIVE_LOW>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrLeft"; #sound-dai-cells = <0>; @@ -773,7 +773,7 @@ right_spkr: speaker@0,4 { compatible = "sdw10217211000"; reg = <0 4>; - powerdown-gpios = <&tlmm 127 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&tlmm 127 GPIO_ACTIVE_LOW>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrRight"; #sound-dai-cells = <0>; From 41841f120345be87a12a4096ebcc2d2959c484ef Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 22 Mar 2023 20:30:51 +0100 Subject: [PATCH 026/175] arm64: dts: qcom: qrb5165-rb5: Use proper WSA881x shutdown GPIO polarity The WSA881x shutdown GPIO is active low (SD_N), but Linux driver assumed DTS always comes with active high. Since Linux drivers were updated to handle proper flag, correct the DTS. The change is not backwards compatible with older Linux kernel. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20230322193051.826167-5-krzysztof.kozlowski@linaro.org --- arch/arm64/boot/dts/qcom/qrb5165-rb5.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts b/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts index aa0a7bd7307c..dd924331b0ee 100644 --- a/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts +++ b/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts @@ -1012,7 +1012,7 @@ left_spkr: speaker@0,3 { compatible = "sdw10217211000"; reg = <0 3>; - powerdown-gpios = <&tlmm 130 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&tlmm 130 GPIO_ACTIVE_LOW>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrLeft"; #sound-dai-cells = <0>; @@ -1021,7 +1021,7 @@ right_spkr: speaker@0,4 { compatible = "sdw10217211000"; reg = <0 4>; - powerdown-gpios = <&tlmm 130 GPIO_ACTIVE_HIGH>; + powerdown-gpios = <&tlmm 130 GPIO_ACTIVE_LOW>; #thermal-sensor-cells = <0>; sound-name-prefix = "SpkrRight"; #sound-dai-cells = <0>; From 72630ba422b70ea0874fc90d526353cf71c72488 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 24 Mar 2023 05:16:50 +0300 Subject: [PATCH 027/175] arm64: dts: qcom: ipq8074-hk01: enable QMP device, not the PHY node Correct PCIe PHY enablement to refer the QMP device nodes rather than PHY device nodes. QMP nodes have 'status = "disabled"' property in the ipq8074.dtsi, while PHY nodes do not correspond to the actual device and do not have the status property. Fixes: e8a7fdc505bb ("arm64: dts: ipq8074: qcom: Re-arrange dts nodes based on address") Signed-off-by: Dmitry Baryshkov Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20230324021651.1799969-1-dmitry.baryshkov@linaro.org --- arch/arm64/boot/dts/qcom/ipq8074-hk01.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/ipq8074-hk01.dts b/arch/arm64/boot/dts/qcom/ipq8074-hk01.dts index ca3f96646b90..5cf07caf4103 100644 --- a/arch/arm64/boot/dts/qcom/ipq8074-hk01.dts +++ b/arch/arm64/boot/dts/qcom/ipq8074-hk01.dts @@ -62,11 +62,11 @@ perst-gpios = <&tlmm 58 GPIO_ACTIVE_LOW>; }; -&pcie_phy0 { +&pcie_qmp0 { status = "okay"; }; -&pcie_phy1 { +&pcie_qmp1 { status = "okay"; }; From 1dc40551f206d20b7e46ea7dd538dcdd928451c6 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 24 Mar 2023 05:16:51 +0300 Subject: [PATCH 028/175] arm64: dts: qcom: ipq8074-hk10: enable QMP device, not the PHY node Correct PCIe PHY enablement to refer the QMP device nodes rather than PHY device nodes. QMP nodes have 'status = "disabled"' property in the ipq8074.dtsi, while PHY nodes do not correspond to the actual device and do not have the status property. Fixes: 1ed34da63a37 ("arm64: dts: qcom: Add board support for HK10") Signed-off-by: Dmitry Baryshkov Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20230324021651.1799969-2-dmitry.baryshkov@linaro.org --- arch/arm64/boot/dts/qcom/ipq8074-hk10.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/ipq8074-hk10.dtsi b/arch/arm64/boot/dts/qcom/ipq8074-hk10.dtsi index 651a231554e0..1b8379ba87f9 100644 --- a/arch/arm64/boot/dts/qcom/ipq8074-hk10.dtsi +++ b/arch/arm64/boot/dts/qcom/ipq8074-hk10.dtsi @@ -48,11 +48,11 @@ perst-gpios = <&tlmm 61 GPIO_ACTIVE_LOW>; }; -&pcie_phy0 { +&pcie_qmp0 { status = "okay"; }; -&pcie_phy1 { +&pcie_qmp1 { status = "okay"; }; From 8056dc043d7f74d7675d413cb3dc4fa290609922 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 26 Mar 2023 18:47:51 +0200 Subject: [PATCH 029/175] riscv: dts: canaan: drop invalid spi-max-frequency The spi-max-frequency is a property of SPI children, not the controller: k210_generic.dtb: spi@50240000: Unevaluated properties are not allowed ('spi-max-frequency' was unexpected) Signed-off-by: Krzysztof Kozlowski Signed-off-by: Conor Dooley --- arch/riscv/boot/dts/canaan/k210.dtsi | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/riscv/boot/dts/canaan/k210.dtsi b/arch/riscv/boot/dts/canaan/k210.dtsi index 07e2e2649604..f87c5164d9cf 100644 --- a/arch/riscv/boot/dts/canaan/k210.dtsi +++ b/arch/riscv/boot/dts/canaan/k210.dtsi @@ -259,7 +259,6 @@ <&sysclk K210_CLK_APB0>; clock-names = "ssi_clk", "pclk"; resets = <&sysrst K210_RST_SPI2>; - spi-max-frequency = <25000000>; }; i2s0: i2s@50250000 { From c355d913d21736c8a00a41cfacc19d2839063d89 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Mon, 27 Mar 2023 10:06:00 +0800 Subject: [PATCH 030/175] arm64: dts: imx8mp: fix address length for LCDIF2 0x238 is the offset for PANIC0_THRES, so the length needs to be greater than that. Use the size from memory map from reference manual. Fixes: 94e6197dadc9 ("arm64: dts: imx8mp: Add LCDIF2 & LDB nodes") Signed-off-by: Alexander Stein Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp.dtsi b/arch/arm64/boot/dts/freescale/imx8mp.dtsi index 2dd60e3252f3..a237275ee017 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp.dtsi @@ -1128,7 +1128,7 @@ lcdif2: display-controller@32e90000 { compatible = "fsl,imx8mp-lcdif"; - reg = <0x32e90000 0x238>; + reg = <0x32e90000 0x10000>; interrupts = ; clocks = <&clk IMX8MP_CLK_MEDIA_DISP2_PIX_ROOT>, <&clk IMX8MP_CLK_MEDIA_APB_ROOT>, From aec4353114a408b3a831a22ba34942d05943e462 Mon Sep 17 00:00:00 2001 From: Marc Gonzalez Date: Mon, 27 Mar 2023 14:09:30 +0200 Subject: [PATCH 031/175] arm64: dts: meson-g12-common: specify full DMC range According to S905X2 Datasheet - Revision 07: DRAM Memory Controller (DMC) register area spans ff638000-ff63a000. According to DeviceTree Specification - Release v0.4-rc1: simple-bus nodes do not require reg property. Fixes: 1499218c80c99a ("arm64: dts: move common G12A & G12B modes to meson-g12-common.dtsi") Signed-off-by: Marc Gonzalez Reviewed-by: Martin Blumenstingl Link: https://lore.kernel.org/r/20230327120932.2158389-2-mgonzalez@freebox.fr Signed-off-by: Neil Armstrong --- arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi index 123a56f7f818..5fe099a2311e 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi @@ -1571,10 +1571,9 @@ dmc: bus@38000 { compatible = "simple-bus"; - reg = <0x0 0x38000 0x0 0x400>; #address-cells = <2>; #size-cells = <2>; - ranges = <0x0 0x0 0x0 0x38000 0x0 0x400>; + ranges = <0x0 0x0 0x0 0x38000 0x0 0x2000>; canvas: video-lut@48 { compatible = "amlogic,canvas"; From 33acea2049b5058b93d1dabb536b494f543f02a2 Mon Sep 17 00:00:00 2001 From: Marc Gonzalez Date: Mon, 27 Mar 2023 14:09:31 +0200 Subject: [PATCH 032/175] arm64: dts: meson-g12-common: resolve conflict between canvas & pmu According to S905X2 Datasheet - Revision 07: DMC_MON area spans 0xff638080-0xff6380c0 DDR_PLL area spans 0xff638c00-0xff638c34 Round DDR_PLL area size up to 0x40 Fixes: 90cf8e21016fa3 ("arm64: dts: meson: Add DDR PMU node") Signed-off-by: Marc Gonzalez Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20230327120932.2158389-3-mgonzalez@freebox.fr Signed-off-by: Neil Armstrong --- arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi index 5fe099a2311e..feb27a0ccfb4 100644 --- a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi @@ -1579,6 +1579,12 @@ compatible = "amlogic,canvas"; reg = <0x0 0x48 0x0 0x14>; }; + + pmu: pmu@80 { + reg = <0x0 0x80 0x0 0x40>, + <0x0 0xc00 0x0 0x40>; + interrupts = ; + }; }; usb2_phy1: phy@3a000 { @@ -1704,12 +1710,6 @@ }; }; - pmu: pmu@ff638000 { - reg = <0x0 0xff638000 0x0 0x100>, - <0x0 0xff638c00 0x0 0x100>; - interrupts = ; - }; - aobus: bus@ff800000 { compatible = "simple-bus"; reg = <0x0 0xff800000 0x0 0x100000>; From f9d323e7c1724270d747657051099826744e91e7 Mon Sep 17 00:00:00 2001 From: Marc Gonzalez Date: Mon, 27 Mar 2023 14:09:32 +0200 Subject: [PATCH 033/175] perf/amlogic: adjust register offsets Commit "perf/amlogic: resolve conflict between canvas & pmu" changed the base address. Fixes: 2016e2113d35 ("perf/amlogic: Add support for Amlogic meson G12 SoC DDR PMU driver") Signed-off-by: Marc Gonzalez Acked-by: Will Deacon Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20230327120932.2158389-4-mgonzalez@freebox.fr Signed-off-by: Neil Armstrong --- drivers/perf/amlogic/meson_g12_ddr_pmu.c | 32 ++++++++++++------------ 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/perf/amlogic/meson_g12_ddr_pmu.c b/drivers/perf/amlogic/meson_g12_ddr_pmu.c index a78fdb15e26c..8b643888d503 100644 --- a/drivers/perf/amlogic/meson_g12_ddr_pmu.c +++ b/drivers/perf/amlogic/meson_g12_ddr_pmu.c @@ -21,23 +21,23 @@ #define DMC_QOS_IRQ BIT(30) /* DMC bandwidth monitor register address offset */ -#define DMC_MON_G12_CTRL0 (0x20 << 2) -#define DMC_MON_G12_CTRL1 (0x21 << 2) -#define DMC_MON_G12_CTRL2 (0x22 << 2) -#define DMC_MON_G12_CTRL3 (0x23 << 2) -#define DMC_MON_G12_CTRL4 (0x24 << 2) -#define DMC_MON_G12_CTRL5 (0x25 << 2) -#define DMC_MON_G12_CTRL6 (0x26 << 2) -#define DMC_MON_G12_CTRL7 (0x27 << 2) -#define DMC_MON_G12_CTRL8 (0x28 << 2) +#define DMC_MON_G12_CTRL0 (0x0 << 2) +#define DMC_MON_G12_CTRL1 (0x1 << 2) +#define DMC_MON_G12_CTRL2 (0x2 << 2) +#define DMC_MON_G12_CTRL3 (0x3 << 2) +#define DMC_MON_G12_CTRL4 (0x4 << 2) +#define DMC_MON_G12_CTRL5 (0x5 << 2) +#define DMC_MON_G12_CTRL6 (0x6 << 2) +#define DMC_MON_G12_CTRL7 (0x7 << 2) +#define DMC_MON_G12_CTRL8 (0x8 << 2) -#define DMC_MON_G12_ALL_REQ_CNT (0x29 << 2) -#define DMC_MON_G12_ALL_GRANT_CNT (0x2a << 2) -#define DMC_MON_G12_ONE_GRANT_CNT (0x2b << 2) -#define DMC_MON_G12_SEC_GRANT_CNT (0x2c << 2) -#define DMC_MON_G12_THD_GRANT_CNT (0x2d << 2) -#define DMC_MON_G12_FOR_GRANT_CNT (0x2e << 2) -#define DMC_MON_G12_TIMER (0x2f << 2) +#define DMC_MON_G12_ALL_REQ_CNT (0x9 << 2) +#define DMC_MON_G12_ALL_GRANT_CNT (0xa << 2) +#define DMC_MON_G12_ONE_GRANT_CNT (0xb << 2) +#define DMC_MON_G12_SEC_GRANT_CNT (0xc << 2) +#define DMC_MON_G12_THD_GRANT_CNT (0xd << 2) +#define DMC_MON_G12_FOR_GRANT_CNT (0xe << 2) +#define DMC_MON_G12_TIMER (0xf << 2) /* Each bit represent a axi line */ PMU_FORMAT_ATTR(event, "config:0-7"); From 292fd843de26c551856e66faf134512c52dd78b4 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 17 Mar 2023 11:15:05 -0400 Subject: [PATCH 034/175] cgroup/cpuset: Fix partition root's cpuset.cpus update bug It was found that commit 7a2127e66a00 ("cpuset: Call set_cpus_allowed_ptr() with appropriate mask for task") introduced a bug that corrupted "cpuset.cpus" of a partition root when it was updated. It is because the tmp->new_cpus field of the passed tmp parameter of update_parent_subparts_cpumask() should not be used at all as it contains important cpumask data that should not be overwritten. Fix it by using tmp->addmask instead. Also update update_cpumask() to make sure that trialcs->cpu_allowed will not be corrupted until it is no longer needed. Fixes: 7a2127e66a00 ("cpuset: Call set_cpus_allowed_ptr() with appropriate mask for task") Signed-off-by: Waiman Long Cc: stable@vger.kernel.org # v6.2+ Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 636f1c682ac0..f310915d1257 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1513,7 +1513,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, spin_unlock_irq(&callback_lock); if (adding || deleting) - update_tasks_cpumask(parent, tmp->new_cpus); + update_tasks_cpumask(parent, tmp->addmask); /* * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary. @@ -1770,10 +1770,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, /* * Use the cpumasks in trialcs for tmpmasks when they are pointers * to allocated cpumasks. + * + * Note that update_parent_subparts_cpumask() uses only addmask & + * delmask, but not new_cpus. */ tmp.addmask = trialcs->subparts_cpus; tmp.delmask = trialcs->effective_cpus; - tmp.new_cpus = trialcs->cpus_allowed; + tmp.new_cpus = NULL; #endif retval = validate_change(cs, trialcs); @@ -1838,6 +1841,11 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, } spin_unlock_irq(&callback_lock); +#ifdef CONFIG_CPUMASK_OFFSTACK + /* Now trialcs->cpus_allowed is available */ + tmp.new_cpus = trialcs->cpus_allowed; +#endif + /* effective_cpus will be updated here */ update_cpumasks_hier(cs, &tmp, false); From aa874cdfec07d4dd9c6f0c356d65c609ba31a26f Mon Sep 17 00:00:00 2001 From: Tharun Kumar P Date: Mon, 20 Mar 2023 19:52:37 +0530 Subject: [PATCH 035/175] i2c: mchp-pci1xxxx: Update Timing registers Update I2C timing registers based on latest hardware design. This fix does not break functionality of chips with older design and existing users will not be affected. Fixes: 361693697249 ("i2c: microchip: pci1xxxx: Add driver for I2C host controller in multifunction endpoint of pci1xxxx switch") Signed-off-by: Tharun Kumar P Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-mchp-pci1xxxx.c | 60 +++++++++++++------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/drivers/i2c/busses/i2c-mchp-pci1xxxx.c b/drivers/i2c/busses/i2c-mchp-pci1xxxx.c index 09af75921147..b21ffd6df927 100644 --- a/drivers/i2c/busses/i2c-mchp-pci1xxxx.c +++ b/drivers/i2c/busses/i2c-mchp-pci1xxxx.c @@ -48,9 +48,9 @@ * SR_HOLD_TIME_XK_TICKS field will indicate the number of ticks of the * baud clock required to program 'Hold Time' at X KHz. */ -#define SR_HOLD_TIME_100K_TICKS 133 -#define SR_HOLD_TIME_400K_TICKS 20 -#define SR_HOLD_TIME_1000K_TICKS 11 +#define SR_HOLD_TIME_100K_TICKS 150 +#define SR_HOLD_TIME_400K_TICKS 20 +#define SR_HOLD_TIME_1000K_TICKS 12 #define SMB_CORE_COMPLETION_REG_OFF3 (SMBUS_MAST_CORE_ADDR_BASE + 0x23) @@ -65,17 +65,17 @@ * the baud clock required to program 'fair idle delay' at X KHz. Fair idle * delay establishes the MCTP T(IDLE_DELAY) period. */ -#define FAIR_BUS_IDLE_MIN_100K_TICKS 969 -#define FAIR_BUS_IDLE_MIN_400K_TICKS 157 -#define FAIR_BUS_IDLE_MIN_1000K_TICKS 157 +#define FAIR_BUS_IDLE_MIN_100K_TICKS 992 +#define FAIR_BUS_IDLE_MIN_400K_TICKS 500 +#define FAIR_BUS_IDLE_MIN_1000K_TICKS 500 /* * FAIR_IDLE_DELAY_XK_TICKS field will indicate the number of ticks of the * baud clock required to satisfy the fairness protocol at X KHz. */ -#define FAIR_IDLE_DELAY_100K_TICKS 1000 -#define FAIR_IDLE_DELAY_400K_TICKS 500 -#define FAIR_IDLE_DELAY_1000K_TICKS 500 +#define FAIR_IDLE_DELAY_100K_TICKS 963 +#define FAIR_IDLE_DELAY_400K_TICKS 156 +#define FAIR_IDLE_DELAY_1000K_TICKS 156 #define SMB_IDLE_SCALING_100K \ ((FAIR_IDLE_DELAY_100K_TICKS << 16) | FAIR_BUS_IDLE_MIN_100K_TICKS) @@ -105,7 +105,7 @@ */ #define BUS_CLK_100K_LOW_PERIOD_TICKS 156 #define BUS_CLK_400K_LOW_PERIOD_TICKS 41 -#define BUS_CLK_1000K_LOW_PERIOD_TICKS 15 +#define BUS_CLK_1000K_LOW_PERIOD_TICKS 15 /* * BUS_CLK_XK_HIGH_PERIOD_TICKS field defines the number of I2C Baud Clock @@ -131,7 +131,7 @@ */ #define CLK_SYNC_100K 4 #define CLK_SYNC_400K 4 -#define CLK_SYNC_1000K 4 +#define CLK_SYNC_1000K 4 #define SMB_CORE_DATA_TIMING_REG_OFF (SMBUS_MAST_CORE_ADDR_BASE + 0x40) @@ -142,25 +142,25 @@ * determines the SCLK hold time following SDAT driven low during the first * START bit in a transfer. */ -#define FIRST_START_HOLD_100K_TICKS 22 -#define FIRST_START_HOLD_400K_TICKS 16 -#define FIRST_START_HOLD_1000K_TICKS 6 +#define FIRST_START_HOLD_100K_TICKS 23 +#define FIRST_START_HOLD_400K_TICKS 8 +#define FIRST_START_HOLD_1000K_TICKS 12 /* * STOP_SETUP_XK_TICKS will indicate the number of ticks of the baud clock * required to program 'STOP_SETUP' timer at X KHz. This timer determines the * SDAT setup time from the rising edge of SCLK for a STOP condition. */ -#define STOP_SETUP_100K_TICKS 157 +#define STOP_SETUP_100K_TICKS 150 #define STOP_SETUP_400K_TICKS 20 -#define STOP_SETUP_1000K_TICKS 12 +#define STOP_SETUP_1000K_TICKS 12 /* * RESTART_SETUP_XK_TICKS will indicate the number of ticks of the baud clock * required to program 'RESTART_SETUP' timer at X KHz. This timer determines the * SDAT setup time from the rising edge of SCLK for a repeated START condition. */ -#define RESTART_SETUP_100K_TICKS 157 +#define RESTART_SETUP_100K_TICKS 156 #define RESTART_SETUP_400K_TICKS 20 #define RESTART_SETUP_1000K_TICKS 12 @@ -169,7 +169,7 @@ * required to program 'DATA_HOLD' timer at X KHz. This timer determines the * SDAT hold time following SCLK driven low. */ -#define DATA_HOLD_100K_TICKS 2 +#define DATA_HOLD_100K_TICKS 12 #define DATA_HOLD_400K_TICKS 2 #define DATA_HOLD_1000K_TICKS 2 @@ -190,35 +190,35 @@ * Bus Idle Minimum time = BUS_IDLE_MIN[7:0] x Baud_Clock_Period x * (BUS_IDLE_MIN_XK_TICKS[7] ? 4,1) */ -#define BUS_IDLE_MIN_100K_TICKS 167UL -#define BUS_IDLE_MIN_400K_TICKS 139UL -#define BUS_IDLE_MIN_1000K_TICKS 133UL +#define BUS_IDLE_MIN_100K_TICKS 36UL +#define BUS_IDLE_MIN_400K_TICKS 10UL +#define BUS_IDLE_MIN_1000K_TICKS 4UL /* * CTRL_CUM_TIME_OUT_XK_TICKS defines SMBus Controller Cumulative Time-Out. * SMBus Controller Cumulative Time-Out duration = * CTRL_CUM_TIME_OUT_XK_TICKS[7:0] x Baud_Clock_Period x 2048 */ -#define CTRL_CUM_TIME_OUT_100K_TICKS 159 -#define CTRL_CUM_TIME_OUT_400K_TICKS 159 -#define CTRL_CUM_TIME_OUT_1000K_TICKS 159 +#define CTRL_CUM_TIME_OUT_100K_TICKS 76 +#define CTRL_CUM_TIME_OUT_400K_TICKS 76 +#define CTRL_CUM_TIME_OUT_1000K_TICKS 76 /* * TARGET_CUM_TIME_OUT_XK_TICKS defines SMBus Target Cumulative Time-Out duration. * SMBus Target Cumulative Time-Out duration = TARGET_CUM_TIME_OUT_XK_TICKS[7:0] x * Baud_Clock_Period x 4096 */ -#define TARGET_CUM_TIME_OUT_100K_TICKS 199 -#define TARGET_CUM_TIME_OUT_400K_TICKS 199 -#define TARGET_CUM_TIME_OUT_1000K_TICKS 199 +#define TARGET_CUM_TIME_OUT_100K_TICKS 95 +#define TARGET_CUM_TIME_OUT_400K_TICKS 95 +#define TARGET_CUM_TIME_OUT_1000K_TICKS 95 /* * CLOCK_HIGH_TIME_OUT_XK defines Clock High time out period. * Clock High time out period = CLOCK_HIGH_TIME_OUT_XK[7:0] x Baud_Clock_Period x 8 */ -#define CLOCK_HIGH_TIME_OUT_100K_TICKS 204 -#define CLOCK_HIGH_TIME_OUT_400K_TICKS 204 -#define CLOCK_HIGH_TIME_OUT_1000K_TICKS 204 +#define CLOCK_HIGH_TIME_OUT_100K_TICKS 97 +#define CLOCK_HIGH_TIME_OUT_400K_TICKS 97 +#define CLOCK_HIGH_TIME_OUT_1000K_TICKS 97 #define TO_SCALING_100K \ ((BUS_IDLE_MIN_100K_TICKS << 24) | (CTRL_CUM_TIME_OUT_100K_TICKS << 16) | \ From 1e020e1b96afdecd20680b5b5be2a6ffc3d27628 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 6 Mar 2023 09:33:08 +0800 Subject: [PATCH 036/175] ubi: Fix failure attaching when vid_hdr offset equals to (sub)page size Following process will make ubi attaching failed since commit 1b42b1a36fc946 ("ubi: ensure that VID header offset ... size"): ID="0xec,0xa1,0x00,0x15" # 128M 128KB 2KB modprobe nandsim id_bytes=$ID flash_eraseall /dev/mtd0 modprobe ubi mtd="0,2048" # set vid_hdr offset as 2048 (one page) (dmesg): ubi0 error: ubi_attach_mtd_dev [ubi]: VID header offset 2048 too large. UBI error: cannot attach mtd0 UBI error: cannot initialize UBI, error -22 Rework original solution, the key point is making sure 'vid_hdr_shift + UBI_VID_HDR_SIZE < ubi->vid_hdr_alsize', so we should check vid_hdr_shift rather not vid_hdr_offset. Then, ubi still support (sub)page aligined VID header offset. Fixes: 1b42b1a36fc946 ("ubi: ensure that VID header offset ... size") Signed-off-by: Zhihao Cheng Tested-by: Nicolas Schichan Tested-by: Miquel Raynal # v5.10, v4.19 Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/build.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 0904eb40c95f..ad025b2ee417 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -666,12 +666,6 @@ static int io_init(struct ubi_device *ubi, int max_beb_per1024) ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size); ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size); - if (ubi->vid_hdr_offset && ((ubi->vid_hdr_offset + UBI_VID_HDR_SIZE) > - ubi->vid_hdr_alsize)) { - ubi_err(ubi, "VID header offset %d too large.", ubi->vid_hdr_offset); - return -EINVAL; - } - dbg_gen("min_io_size %d", ubi->min_io_size); dbg_gen("max_write_size %d", ubi->max_write_size); dbg_gen("hdrs_min_io_size %d", ubi->hdrs_min_io_size); @@ -689,6 +683,21 @@ static int io_init(struct ubi_device *ubi, int max_beb_per1024) ubi->vid_hdr_aloffset; } + /* + * Memory allocation for VID header is ubi->vid_hdr_alsize + * which is described in comments in io.c. + * Make sure VID header shift + UBI_VID_HDR_SIZE not exceeds + * ubi->vid_hdr_alsize, so that all vid header operations + * won't access memory out of bounds. + */ + if ((ubi->vid_hdr_shift + UBI_VID_HDR_SIZE) > ubi->vid_hdr_alsize) { + ubi_err(ubi, "Invalid VID header offset %d, VID header shift(%d)" + " + VID header size(%zu) > VID header aligned size(%d).", + ubi->vid_hdr_offset, ubi->vid_hdr_shift, + UBI_VID_HDR_SIZE, ubi->vid_hdr_alsize); + return -EINVAL; + } + /* Similar for the data offset */ ubi->leb_start = ubi->vid_hdr_offset + UBI_VID_HDR_SIZE; ubi->leb_start = ALIGN(ubi->leb_start, ubi->min_io_size); From 8671133082176d1388e20ac33d61cf7e3b05adf5 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 24 Mar 2023 11:15:31 +0100 Subject: [PATCH 037/175] tee: Pass a pointer to virt_to_page() Like the other calls in this function virt_to_page() expects a pointer, not an integer. However since many architectures implement virt_to_pfn() as a macro, this function becomes polymorphic and accepts both a (unsigned long) and a (void *). Fix this up with an explicit cast. Signed-off-by: Linus Walleij Signed-off-by: Jens Wiklander --- drivers/tee/tee_shm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index b1c6231defad..673cf0359494 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -32,7 +32,7 @@ static int shm_get_kernel_pages(unsigned long start, size_t page_count, is_kmap_addr((void *)start))) return -EINVAL; - page = virt_to_page(start); + page = virt_to_page((void *)start); for (n = 0; n < page_count; n++) { pages[n] = page + n; get_page(pages[n]); From 87891399d9883ed823ba58c2be3ac20cc499ad7d Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Mon, 27 Mar 2023 10:35:47 -0500 Subject: [PATCH 038/175] arm64: dts: rockchip: Add clk_rtc_32k to Anbernic xx3 Devices For the Anbernic devices to display properly, we need to specify the clock frequency of the PLL_VPLL. Adding the parent clock in the rk356x.dtsi requires us to update our clock definitions to accomplish this. Fixes: 64b69474edf3 ("arm64: dts: rockchip: assign rate to clk_rtc_32k on rk356x") Signed-off-by: Chris Morgan Link: https://lore.kernel.org/r/20230327153547.821822-1-macroalpha82@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg353x.dtsi | 6 ++++-- arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg503.dts | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg353x.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg353x.dtsi index 65a80d1f6d91..9a0e217f069f 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg353x.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg353x.dtsi @@ -16,8 +16,10 @@ }; &cru { - assigned-clocks = <&cru PLL_GPLL>, <&pmucru PLL_PPLL>, <&cru PLL_VPLL>; - assigned-clock-rates = <1200000000>, <200000000>, <241500000>; + assigned-clocks = <&pmucru CLK_RTC_32K>, <&cru PLL_GPLL>, + <&pmucru PLL_PPLL>, <&cru PLL_VPLL>; + assigned-clock-rates = <32768>, <1200000000>, + <200000000>, <241500000>; }; &gpio_keys_control { diff --git a/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg503.dts b/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg503.dts index b4b2df821cba..c763c7f3b1b3 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg503.dts +++ b/arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg503.dts @@ -105,8 +105,10 @@ }; &cru { - assigned-clocks = <&cru PLL_GPLL>, <&pmucru PLL_PPLL>, <&cru PLL_VPLL>; - assigned-clock-rates = <1200000000>, <200000000>, <500000000>; + assigned-clocks = <&pmucru CLK_RTC_32K>, <&cru PLL_GPLL>, + <&pmucru PLL_PPLL>, <&cru PLL_VPLL>; + assigned-clock-rates = <32768>, <1200000000>, + <200000000>, <500000000>; }; &dsi_dphy0 { From 3adf89324a2b2a9dbc2c12d8895021e7e34e3346 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 31 Mar 2023 01:19:23 +0200 Subject: [PATCH 039/175] arm64: dts: rockchip: Remove non-existing pwm-delay-us property There is neither a driver that parses this nor a DT binding schema that documents it, so let's remove from the DTS files that make use of this. The properties that exist are post-pwm-on-delay-ms and pwm-off-delay-ms, defined in the pwm-backlight DT binding. If the delays are really needed then those properties should be used instead. Brian Norris mentioned though that looking at the first downstream usage of the pwm-delay-us property for RK3399 Gru systems in ChromiumOS tree, he couldn't find a spec reference that said that this was really needed. So perhaps it was unnecessary added and a simple removal would be enough. Signed-off-by: Javier Martinez Canillas Reviewed-by: Brian Norris Link: https://lore.kernel.org/r/20230330231924.2404747-1-javierm@redhat.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3368-evb.dtsi | 1 - arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi | 1 - arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi | 1 - 3 files changed, 3 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3368-evb.dtsi b/arch/arm64/boot/dts/rockchip/rk3368-evb.dtsi index 083452c67711..e47d1398aeca 100644 --- a/arch/arm64/boot/dts/rockchip/rk3368-evb.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3368-evb.dtsi @@ -61,7 +61,6 @@ pinctrl-names = "default"; pinctrl-0 = <&bl_en>; pwms = <&pwm0 0 1000000 PWM_POLARITY_INVERTED>; - pwm-delay-us = <10000>; }; emmc_pwrseq: emmc-pwrseq { diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi index ee6095baba4d..5c1929d41cc0 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi @@ -198,7 +198,6 @@ power-supply = <&pp3300_disp>; pinctrl-names = "default"; pinctrl-0 = <&bl_en>; - pwm-delay-us = <10000>; }; gpio_keys: gpio-keys { diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi index a47d9f758611..c5e7de60c121 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi @@ -167,7 +167,6 @@ pinctrl-names = "default"; pinctrl-0 = <&bl_en>; pwms = <&pwm1 0 1000000 0>; - pwm-delay-us = <10000>; }; dmic: dmic { From b277fc793daf258877b4c0744b52f69d6e6ba22e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 4 Apr 2023 09:44:33 +0530 Subject: [PATCH 040/175] powerpc/papr_scm: Update the NUMA distance table for the target node Platform device helper routines won't update the NUMA distance table while creating a platform device, even if the device is present on a NUMA node that doesn't have memory or CPU. This is especially true for pmem devices. If the target node of the pmem device is not online, we find the nearest online node to the device and associate the pmem device with that online node. To find the nearest online node, we should have the numa distance table updated correctly. Update the distance information during the device probe. For a papr scm device on NUMA node 3 distance_lookup_table value for distance_ref_points_depth = 2 before and after fix is below: Before fix: node 3 distance depth 0 - 0 node 3 distance depth 1 - 0 node 4 distance depth 0 - 4 node 4 distance depth 1 - 2 node 5 distance depth 0 - 5 node 5 distance depth 1 - 1 After fix node 3 distance depth 0 - 3 node 3 distance depth 1 - 1 node 4 distance depth 0 - 4 node 4 distance depth 1 - 2 node 5 distance depth 0 - 5 node 5 distance depth 1 - 1 Without the fix, the nearest numa node to the pmem device (NUMA node 3) will be picked as 4. After the fix, we get the correct numa node which is 5. Fixes: da1115fdbd6e ("powerpc/nvdimm: Pick nearby online node if the device node is not online") Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://msgid.link/20230404041433.1781804-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/numa.c | 1 + arch/powerpc/platforms/pseries/papr_scm.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index b44ce71917d7..16cfe56be05b 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -366,6 +366,7 @@ void update_numa_distance(struct device_node *node) WARN(numa_distance_table[nid][nid] == -1, "NUMA distance details for node %d not provided\n", nid); } +EXPORT_SYMBOL_GPL(update_numa_distance); /* * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 2f8385523a13..1a53e048ceb7 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -1428,6 +1428,13 @@ static int papr_scm_probe(struct platform_device *pdev) return -ENODEV; } + /* + * open firmware platform device create won't update the NUMA + * distance table. For PAPR SCM devices we use numa_map_to_online_node() + * to find the nearest online NUMA node and that requires correct + * distance table information. + */ + update_numa_distance(dn); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) From ad8cd35c58ca3ec5e93f52a0124899627b98efb2 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 27 Mar 2023 14:29:48 +0200 Subject: [PATCH 041/175] arm64: dts: qcom: sc8280xp-pmics: fix pon compatible and registers The pmk8280 PMIC PON peripheral is gen3 and uses two sets of registers; hlos and pbs. This specifically fixes the following error message during boot when the pbs registers are not defined: PON_PBS address missing, can't read HW debounce time Note that this also enables the spurious interrupt workaround introduced by commit 0b65118e6ba3 ("Input: pm8941-pwrkey - add software key press debouncing support") (which may or may not be needed). Fixes: ccd3517faf18 ("arm64: dts: qcom: sc8280xp: Add reference device") Signed-off-by: Johan Hovold Reviewed-by: Dmitry Baryshkov Tested-by: Steev Klimaszewski #Thinkpad X13s Reviewed-by: Konrad Dybcio Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20230327122948.4323-1-johan+linaro@kernel.org --- arch/arm64/boot/dts/qcom/sc8280xp-pmics.dtsi | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-pmics.dtsi b/arch/arm64/boot/dts/qcom/sc8280xp-pmics.dtsi index df7d28f7ae60..be446eba4fa7 100644 --- a/arch/arm64/boot/dts/qcom/sc8280xp-pmics.dtsi +++ b/arch/arm64/boot/dts/qcom/sc8280xp-pmics.dtsi @@ -59,8 +59,9 @@ #size-cells = <0>; pmk8280_pon: pon@1300 { - compatible = "qcom,pm8998-pon"; - reg = <0x1300>; + compatible = "qcom,pmk8350-pon"; + reg = <0x1300>, <0x800>; + reg-names = "hlos", "pbs"; pmk8280_pon_pwrkey: pwrkey { compatible = "qcom,pmk8350-pwrkey"; From 4b6d621c9d859ff89e68cebf6178652592676013 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 1 Apr 2023 22:03:27 +0200 Subject: [PATCH 042/175] memstick: fix memory leak if card device is never registered When calling dev_set_name() memory is allocated for the name for the struct device. Once that structure device is registered, or attempted to be registerd, with the driver core, the driver core will handle cleaning up that memory when the device is removed from the system. Unfortunatly for the memstick code, there is an error path that causes the struct device to never be registered, and so the memory allocated in dev_set_name will be leaked. Fix that leak by manually freeing it right before the memory for the device is freed. Cc: Maxim Levitsky Cc: Alex Dubov Cc: Ulf Hansson Cc: "Rafael J. Wysocki" Cc: Hans de Goede Cc: Kay Sievers Cc: linux-mmc@vger.kernel.org Fixes: 0252c3b4f018 ("memstick: struct device - replace bus_id with dev_name(), dev_set_name()") Cc: stable Co-developed-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman Co-developed-by: Mirsad Goran Todorovac Signed-off-by: Mirsad Goran Todorovac Link: https://lore.kernel.org/r/20230401200327.16800-1-gregkh@linuxfoundation.org Signed-off-by: Ulf Hansson --- drivers/memstick/core/memstick.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c index bf7667845459..bbfaf6536903 100644 --- a/drivers/memstick/core/memstick.c +++ b/drivers/memstick/core/memstick.c @@ -410,6 +410,7 @@ static struct memstick_dev *memstick_alloc_card(struct memstick_host *host) return card; err_out: host->card = old_card; + kfree_const(card->dev.kobj.name); kfree(card); return NULL; } @@ -468,8 +469,10 @@ static void memstick_check(struct work_struct *work) put_device(&card->dev); host->card = NULL; } - } else + } else { + kfree_const(card->dev.kobj.name); kfree(card); + } } out_power_off: From 85af7ffd24da38e416a14bd6bf207154d94faa83 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 27 Mar 2023 18:03:21 +0800 Subject: [PATCH 043/175] arm64: dts: imx8mm-evk: correct pmic clock source The osc_32k supports #clock-cells as 0, using an id is wrong, drop it. Fixes: a6a355ede574 ("arm64: dts: imx8mm-evk: Add 32.768 kHz clock to PMIC") Signed-off-by: Peng Fan Reviewed-by: Marco Felsch Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mm-evk.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mm-evk.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-evk.dtsi index d1a6390976a9..3f9dfd4d3884 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-evk.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-evk.dtsi @@ -194,7 +194,7 @@ rohm,reset-snvs-powered; #clock-cells = <0>; - clocks = <&osc_32k 0>; + clocks = <&osc_32k>; clock-output-names = "clk-32k-out"; regulators { From 130c1f4306d56301216baaea68afdd909892c73f Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Tue, 28 Mar 2023 14:19:04 +0800 Subject: [PATCH 044/175] arm64: dts: imx8mm-verdin: correct off-on-delay The property should be off-on-delay-us, not off-on-delay Fixes: 6a57f224f734 ("arm64: dts: freescale: add initial support for verdin imx8m mini") Signed-off-by: Peng Fan Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi index 88321b5b0693..6f0811587142 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi @@ -99,7 +99,7 @@ compatible = "regulator-fixed"; enable-active-high; gpio = <&gpio2 20 GPIO_ACTIVE_HIGH>; /* PMIC_EN_ETH */ - off-on-delay = <500000>; + off-on-delay-us = <500000>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_reg_eth>; regulator-always-on; @@ -139,7 +139,7 @@ enable-active-high; /* Verdin SD_1_PWR_EN (SODIMM 76) */ gpio = <&gpio3 5 GPIO_ACTIVE_HIGH>; - off-on-delay = <100000>; + off-on-delay-us = <100000>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_usdhc2_pwr_en>; regulator-max-microvolt = <3300000>; From 02c447a0d79f0c966563e5095a017cbf9477ca6d Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Tue, 28 Mar 2023 14:19:05 +0800 Subject: [PATCH 045/175] arm64: dts: imx8mp-verdin: correct off-on-delay The property should be off-on-delay-us, not off-on-delay Fixes: a39ed23bdf6e ("arm64: dts: freescale: add initial support for verdin imx8m plus") Signed-off-by: Peng Fan Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-verdin-dev.dtsi | 2 +- arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-verdin-dev.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-verdin-dev.dtsi index 361426c0da0a..c29622529200 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-verdin-dev.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-verdin-dev.dtsi @@ -10,7 +10,7 @@ compatible = "regulator-fixed"; enable-active-high; gpio = <&gpio_expander_21 4 GPIO_ACTIVE_HIGH>; /* ETH_PWR_EN */ - off-on-delay = <500000>; + off-on-delay-us = <500000>; regulator-max-microvolt = <3300000>; regulator-min-microvolt = <3300000>; regulator-name = "+V3.3_ETH"; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi index 0dd6180a8e39..1608775da0ad 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi @@ -87,7 +87,7 @@ compatible = "regulator-fixed"; enable-active-high; gpio = <&gpio2 20 GPIO_ACTIVE_HIGH>; /* PMIC_EN_ETH */ - off-on-delay = <500000>; + off-on-delay-us = <500000>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_reg_eth>; regulator-always-on; @@ -128,7 +128,7 @@ enable-active-high; /* Verdin SD_1_PWR_EN (SODIMM 76) */ gpio = <&gpio4 22 GPIO_ACTIVE_HIGH>; - off-on-delay = <100000>; + off-on-delay-us = <100000>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_usdhc2_pwr_en>; regulator-max-microvolt = <3300000>; From 82655f90701de5e0f7381b16534602bc2b8fe920 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Tue, 28 Mar 2023 15:51:46 -0300 Subject: [PATCH 046/175] ARM: dts: imx7d-remarkable2: Remove unnecessary #address-cells/#size-cells Building with W=1 leads to the following dtc warning: arch/arm/boot/dts/imx7d-remarkable2.dts:319.19-335.4: Warning (avoid_unnecessary_addr_size): /soc/bus@30800000/i2c@30a50000/pmic@62: unnecessary #address-cells/#size-cells without "ranges" or child "reg" property Remove unnecessary #address-cells/#size-cells to fix it. Fixes: 9076cbaa7757 ("ARM: dts: imx7d-remarkable2: Enable silergy,sy7636a") Signed-off-by: Fabio Estevam Reviewed-by: Alistair Francis Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx7d-remarkable2.dts | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm/boot/dts/imx7d-remarkable2.dts b/arch/arm/boot/dts/imx7d-remarkable2.dts index 8b2f11e85e05..427f8d04ec89 100644 --- a/arch/arm/boot/dts/imx7d-remarkable2.dts +++ b/arch/arm/boot/dts/imx7d-remarkable2.dts @@ -118,8 +118,6 @@ reg = <0x62>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_epdpmic>; - #address-cells = <1>; - #size-cells = <0>; #thermal-sensor-cells = <0>; epd-pwr-good-gpios = <&gpio6 21 GPIO_ACTIVE_HIGH>; From 3847e716b68e871ab64fc0cdad7fac9b7c1b022d Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Tue, 28 Mar 2023 15:51:47 -0300 Subject: [PATCH 047/175] ARM: dts: imx6ull-colibri: Remove unnecessary #address-cells/#size-cells Building with W=1 leads to the following dtc warning: arch/arm/boot/dts/imx6ull-colibri.dtsi:36.9-46.5: Warning (graph_child_address): /connector/ports: graph node has single child node 'port@0', #address-cells/#size-cells are not necessary Since a single port is used, 'ports' can be removed, as well as the unnecessary #address-cells/#size-cells. Fixes: bd5880e10982 ("ARM: dts: colibri-imx6ull: Enable dual-role switching") Signed-off-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx6ull-colibri.dtsi | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/arm/boot/dts/imx6ull-colibri.dtsi b/arch/arm/boot/dts/imx6ull-colibri.dtsi index bf64ba84b358..fde8a19aac0f 100644 --- a/arch/arm/boot/dts/imx6ull-colibri.dtsi +++ b/arch/arm/boot/dts/imx6ull-colibri.dtsi @@ -33,15 +33,9 @@ self-powered; type = "micro"; - ports { - #address-cells = <1>; - #size-cells = <0>; - - port@0 { - reg = <0>; - usb_dr_connector: endpoint { - remote-endpoint = <&usb1_drd_sw>; - }; + port { + usb_dr_connector: endpoint { + remote-endpoint = <&usb1_drd_sw>; }; }; }; From 5438b349c0512a6fe023976aad8b9f19ca671dd1 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 26 Mar 2023 22:45:19 +0200 Subject: [PATCH 048/175] arm64: dts: rockchip: use just "port" in panel on Pinebook Pro The panel bindings expect to have only one port, thus they do not allow to use "ports" node: rk3399-pinebook-pro.dtb: edp-panel: 'ports' does not match any of the regexes: 'pinctrl-[0-9]+' Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20230326204520.80859-2-krzysztof.kozlowski@linaro.org Signed-off-by: Heiko Stuebner --- .../boot/dts/rockchip/rk3399-pinebook-pro.dts | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts b/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts index 54bb0398128f..ddd45de97950 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts +++ b/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts @@ -50,19 +50,9 @@ pinctrl-0 = <&panel_en_pin>; power-supply = <&vcc3v3_panel>; - ports { - #address-cells = <1>; - #size-cells = <0>; - - port@0 { - reg = <0>; - #address-cells = <1>; - #size-cells = <0>; - - panel_in_edp: endpoint@0 { - reg = <0>; - remote-endpoint = <&edp_out_panel>; - }; + port { + panel_in_edp: endpoint { + remote-endpoint = <&edp_out_panel>; }; }; }; From 2dd16a23e8c687bde605dbdcfedaed97bb2a0c0e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 26 Mar 2023 22:45:20 +0200 Subject: [PATCH 049/175] arm64: dts: rockchip: use just "port" in panel on RockPro64 The panel bindings expect to have only one port, thus they do not allow to use "ports" node: rk3399-rockpro64.dtb: panel@0: 'ports' does not match any of the regexes: 'pinctrl-[0-9]+' There is only one endpoint, so use simpler form without "reg". Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20230326204520.80859-3-krzysztof.kozlowski@linaro.org Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi index 78157521e944..bca2b50e0a93 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi @@ -647,16 +647,10 @@ avdd-supply = <&avdd>; backlight = <&backlight>; dvdd-supply = <&vcc3v3_s0>; - ports { - #address-cells = <1>; - #size-cells = <0>; - port@0 { - reg = <0>; - - mipi_in_panel: endpoint { - remote-endpoint = <&mipi_out_panel>; - }; + port { + mipi_in_panel: endpoint { + remote-endpoint = <&mipi_out_panel>; }; }; }; From 60a655debd36e3278a46872accc1a51a54f94f02 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 26 Mar 2023 22:45:18 +0200 Subject: [PATCH 050/175] arm64: dts: rockchip: correct panel supplies on some rk3326 boards The Anbernic and Odroid Go have different panels and take differently named supplies, so move all the supplies to DTS defining actual panel to fix warnings like: rk3326-odroid-go3.dtb: panel@0: 'IOVCC-supply' is a required property rk3326-odroid-go3.dtb: panel@0: 'iovcc-supply', 'vdd-supply' do not match any of the regexes: 'pinctrl-[0-9]+' Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20230326204520.80859-1-krzysztof.kozlowski@linaro.org Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3326-anbernic-rg351m.dts | 2 ++ arch/arm64/boot/dts/rockchip/rk3326-odroid-go.dtsi | 2 -- arch/arm64/boot/dts/rockchip/rk3326-odroid-go2-v11.dts | 2 ++ arch/arm64/boot/dts/rockchip/rk3326-odroid-go2.dts | 2 ++ 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3326-anbernic-rg351m.dts b/arch/arm64/boot/dts/rockchip/rk3326-anbernic-rg351m.dts index 61b31688b469..ce318e05f0a6 100644 --- a/arch/arm64/boot/dts/rockchip/rk3326-anbernic-rg351m.dts +++ b/arch/arm64/boot/dts/rockchip/rk3326-anbernic-rg351m.dts @@ -24,6 +24,8 @@ &internal_display { compatible = "elida,kd35t133"; + iovcc-supply = <&vcc_lcd>; + vdd-supply = <&vcc_lcd>; }; &pwm0 { diff --git a/arch/arm64/boot/dts/rockchip/rk3326-odroid-go.dtsi b/arch/arm64/boot/dts/rockchip/rk3326-odroid-go.dtsi index 04eba432fb0e..80fc53c807a4 100644 --- a/arch/arm64/boot/dts/rockchip/rk3326-odroid-go.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3326-odroid-go.dtsi @@ -235,10 +235,8 @@ internal_display: panel@0 { reg = <0>; backlight = <&backlight>; - iovcc-supply = <&vcc_lcd>; reset-gpios = <&gpio3 RK_PC0 GPIO_ACTIVE_LOW>; rotation = <270>; - vdd-supply = <&vcc_lcd>; port { mipi_in_panel: endpoint { diff --git a/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2-v11.dts b/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2-v11.dts index 139c898e590e..d94ac81eb4e6 100644 --- a/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2-v11.dts +++ b/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2-v11.dts @@ -83,6 +83,8 @@ &internal_display { compatible = "elida,kd35t133"; + iovcc-supply = <&vcc_lcd>; + vdd-supply = <&vcc_lcd>; }; &rk817 { diff --git a/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2.dts b/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2.dts index 4702183b673c..aa6f5b12206b 100644 --- a/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2.dts +++ b/arch/arm64/boot/dts/rockchip/rk3326-odroid-go2.dts @@ -59,6 +59,8 @@ &internal_display { compatible = "elida,kd35t133"; + iovcc-supply = <&vcc_lcd>; + vdd-supply = <&vcc_lcd>; }; &rk817_charger { From 86d5b27b379256cd5d48974b4cd7ad03091eea6b Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Tue, 4 Apr 2023 09:13:03 -0300 Subject: [PATCH 051/175] ARM: imx_v6_v7_defconfig: Fix unintentional disablement of PCI Since commit 75c2f26da03f ("PCI: imx6: Add i.MX PCIe EP mode support") the i.MX6 PCI driver is no longer selected by default. The existing PCI_IMX6 was made a hidden option, selected by new options PCI_IMX6_HOST (for the existing support) and PCI_IMX6_EP (for the endpoint mode), but there has been no corresponding update to imx_v6_v7_defconfig so the PCI_IMX6 ends up getting disabled. Switch imx_v6_v7_defconfig to PCI_IMX6_HOST to preserve the existing functionality. This is based on the same fix done in commit 0cd5780eb625 ("arm64: defconfig: Fix unintentional disablement of PCI on i.MX"). Fixes: 75c2f26da03f ("PCI: imx6: Add i.MX PCIe EP mode support") Reported-by: Mattias Barthel Signed-off-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/configs/imx_v6_v7_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig index 6dc6fed12af8..8d002c6e6cb3 100644 --- a/arch/arm/configs/imx_v6_v7_defconfig +++ b/arch/arm/configs/imx_v6_v7_defconfig @@ -76,7 +76,7 @@ CONFIG_RFKILL=y CONFIG_RFKILL_INPUT=y CONFIG_PCI=y CONFIG_PCI_MSI=y -CONFIG_PCI_IMX6=y +CONFIG_PCI_IMX6_HOST=y CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y # CONFIG_STANDALONE is not set From e98e7a82bca2b6dce3e03719cff800ec913f9af7 Mon Sep 17 00:00:00 2001 From: Oswald Buddenhagen Date: Wed, 5 Apr 2023 22:12:19 +0200 Subject: [PATCH 052/175] ALSA: i2c/cs8427: fix iec958 mixer control deactivation snd_cs8427_iec958_active() would always delete SNDRV_CTL_ELEM_ACCESS_INACTIVE, even though the function has an argument `active`. Signed-off-by: Oswald Buddenhagen Cc: Link: https://lore.kernel.org/r/20230405201219.2197811-1-oswald.buddenhagen@gmx.de Signed-off-by: Takashi Iwai --- sound/i2c/cs8427.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sound/i2c/cs8427.c b/sound/i2c/cs8427.c index 65012af6a36e..f58b14b49045 100644 --- a/sound/i2c/cs8427.c +++ b/sound/i2c/cs8427.c @@ -561,10 +561,13 @@ int snd_cs8427_iec958_active(struct snd_i2c_device *cs8427, int active) if (snd_BUG_ON(!cs8427)) return -ENXIO; chip = cs8427->private_data; - if (active) + if (active) { memcpy(chip->playback.pcm_status, chip->playback.def_status, 24); - chip->playback.pcm_ctl->vd[0].access &= ~SNDRV_CTL_ELEM_ACCESS_INACTIVE; + chip->playback.pcm_ctl->vd[0].access &= ~SNDRV_CTL_ELEM_ACCESS_INACTIVE; + } else { + chip->playback.pcm_ctl->vd[0].access |= SNDRV_CTL_ELEM_ACCESS_INACTIVE; + } snd_ctl_notify(cs8427->bus->card, SNDRV_CTL_EVENT_MASK_VALUE | SNDRV_CTL_EVENT_MASK_INFO, &chip->playback.pcm_ctl->id); From c17f8fd31700392b1bb9e7b66924333568cb3700 Mon Sep 17 00:00:00 2001 From: Oswald Buddenhagen Date: Wed, 5 Apr 2023 22:12:19 +0200 Subject: [PATCH 053/175] ALSA: hda/sigmatel: add pin overrides for Intel DP45SG motherboard Like the other boards from the D*45* series, this one sets up the outputs not quite correctly. Signed-off-by: Oswald Buddenhagen Cc: Link: https://lore.kernel.org/r/20230405201220.2197826-1-oswald.buddenhagen@gmx.de Signed-off-by: Takashi Iwai --- Documentation/sound/hd-audio/models.rst | 2 +- sound/pci/hda/patch_sigmatel.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/sound/hd-audio/models.rst b/Documentation/sound/hd-audio/models.rst index 9b52f50a6854..120430450014 100644 --- a/Documentation/sound/hd-audio/models.rst +++ b/Documentation/sound/hd-audio/models.rst @@ -704,7 +704,7 @@ ref no-jd BIOS setup but without jack-detection intel - Intel DG45* mobos + Intel D*45* mobos dell-m6-amic Dell desktops/laptops with analog mics dell-m6-dmic diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c index a794a01a68ca..64a97d8c1b03 100644 --- a/sound/pci/hda/patch_sigmatel.c +++ b/sound/pci/hda/patch_sigmatel.c @@ -1955,6 +1955,8 @@ static const struct snd_pci_quirk stac92hd73xx_fixup_tbl[] = { "DFI LanParty", STAC_92HD73XX_REF), SND_PCI_QUIRK(PCI_VENDOR_ID_DFI, 0x3101, "DFI LanParty", STAC_92HD73XX_REF), + SND_PCI_QUIRK(PCI_VENDOR_ID_INTEL, 0x5001, + "Intel DP45SG", STAC_92HD73XX_INTEL), SND_PCI_QUIRK(PCI_VENDOR_ID_INTEL, 0x5002, "Intel DG45ID", STAC_92HD73XX_INTEL), SND_PCI_QUIRK(PCI_VENDOR_ID_INTEL, 0x5003, From f342ac00da1064eb4f94b1f4bcacbdfea955797a Mon Sep 17 00:00:00 2001 From: Oswald Buddenhagen Date: Wed, 5 Apr 2023 22:12:20 +0200 Subject: [PATCH 054/175] ALSA: hda/sigmatel: fix S/PDIF out on Intel D*45* motherboards The BIOS botches this one completely - it says the 2nd S/PDIF output is used, while in fact it's the 1st one. This is tested on DP45SG, but I'm assuming it's valid for the other boards in the series as well. Also add some comments regarding the pins. FWIW, the codec is apparently still sold by Tempo Semiconductor, Inc., where one can download the documentation. Signed-off-by: Oswald Buddenhagen Cc: Link: https://lore.kernel.org/r/20230405201220.2197826-2-oswald.buddenhagen@gmx.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_sigmatel.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c index 64a97d8c1b03..61258b0aac8d 100644 --- a/sound/pci/hda/patch_sigmatel.c +++ b/sound/pci/hda/patch_sigmatel.c @@ -1707,6 +1707,7 @@ static const struct snd_pci_quirk stac925x_fixup_tbl[] = { }; static const struct hda_pintbl ref92hd73xx_pin_configs[] = { + // Port A-H { 0x0a, 0x02214030 }, { 0x0b, 0x02a19040 }, { 0x0c, 0x01a19020 }, @@ -1715,9 +1716,12 @@ static const struct hda_pintbl ref92hd73xx_pin_configs[] = { { 0x0f, 0x01014010 }, { 0x10, 0x01014020 }, { 0x11, 0x01014030 }, + // CD in { 0x12, 0x02319040 }, + // Digial Mic ins { 0x13, 0x90a000f0 }, { 0x14, 0x90a000f0 }, + // Digital outs { 0x22, 0x01452050 }, { 0x23, 0x01452050 }, {} @@ -1758,6 +1762,7 @@ static const struct hda_pintbl alienware_m17x_pin_configs[] = { }; static const struct hda_pintbl intel_dg45id_pin_configs[] = { + // Analog outputs { 0x0a, 0x02214230 }, { 0x0b, 0x02A19240 }, { 0x0c, 0x01013214 }, @@ -1765,6 +1770,9 @@ static const struct hda_pintbl intel_dg45id_pin_configs[] = { { 0x0e, 0x01A19250 }, { 0x0f, 0x01011212 }, { 0x10, 0x01016211 }, + // Digital output + { 0x22, 0x01451380 }, + { 0x23, 0x40f000f0 }, {} }; From b09c551c77c7e01dc6e4f3c8bf06b5ffa7b06db5 Mon Sep 17 00:00:00 2001 From: Oswald Buddenhagen Date: Wed, 5 Apr 2023 22:12:20 +0200 Subject: [PATCH 055/175] ALSA: emu10k1: fix capture interrupt handler unlinking Due to two copy/pastos, closing the MIC or EFX capture device would make a running ADC capture hang due to unsetting its interrupt handler. In principle, this would have also allowed dereferencing dangling pointers, but we're actually rather thorough at disabling and flushing the ints. While it may sound like one, this actually wasn't a hypothetical bug: PortAudio will open a capture stream at startup (and close it right away) even if not asked to. If the first device is busy, it will just proceed with the next one ... thus killing a concurrent capture. Signed-off-by: Oswald Buddenhagen Cc: Link: https://lore.kernel.org/r/20230405201220.2197923-1-oswald.buddenhagen@gmx.de Signed-off-by: Takashi Iwai --- sound/pci/emu10k1/emupcm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/emu10k1/emupcm.c b/sound/pci/emu10k1/emupcm.c index 48af77ae8020..908f76f1bb9f 100644 --- a/sound/pci/emu10k1/emupcm.c +++ b/sound/pci/emu10k1/emupcm.c @@ -1236,7 +1236,7 @@ static int snd_emu10k1_capture_mic_close(struct snd_pcm_substream *substream) { struct snd_emu10k1 *emu = snd_pcm_substream_chip(substream); - emu->capture_interrupt = NULL; + emu->capture_mic_interrupt = NULL; emu->pcm_capture_mic_substream = NULL; return 0; } @@ -1344,7 +1344,7 @@ static int snd_emu10k1_capture_efx_close(struct snd_pcm_substream *substream) { struct snd_emu10k1 *emu = snd_pcm_substream_chip(substream); - emu->capture_interrupt = NULL; + emu->capture_efx_interrupt = NULL; emu->pcm_capture_efx_substream = NULL; return 0; } From 8dd13214a810c695044aa168c0ddba1a9c433e4f Mon Sep 17 00:00:00 2001 From: Oswald Buddenhagen Date: Wed, 5 Apr 2023 22:12:20 +0200 Subject: [PATCH 056/175] ALSA: emu10k1: don't create old pass-through playback device on Audigy It could have never worked, as snd_emu10k1_fx8010_playback_prepare() and snd_emu10k1_fx8010_playback_hw_free() assume the emu10k1 offset for the ETRAM, and the default DSP code includes no handler for it. It also wouldn't make a lot of sense to make it work, as Audigy has an own, much simpler, pass-through mechanism. So just skip creation of the device. Signed-off-by: Oswald Buddenhagen Cc: Link: https://lore.kernel.org/r/20230405201220.2197938-1-oswald.buddenhagen@gmx.de Signed-off-by: Takashi Iwai --- sound/pci/emu10k1/emupcm.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sound/pci/emu10k1/emupcm.c b/sound/pci/emu10k1/emupcm.c index 908f76f1bb9f..6ec394fb1846 100644 --- a/sound/pci/emu10k1/emupcm.c +++ b/sound/pci/emu10k1/emupcm.c @@ -1781,17 +1781,21 @@ int snd_emu10k1_pcm_efx(struct snd_emu10k1 *emu, int device) struct snd_kcontrol *kctl; int err; - err = snd_pcm_new(emu->card, "emu10k1 efx", device, 8, 1, &pcm); + err = snd_pcm_new(emu->card, "emu10k1 efx", device, emu->audigy ? 0 : 8, 1, &pcm); if (err < 0) return err; pcm->private_data = emu; - snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &snd_emu10k1_fx8010_playback_ops); + if (!emu->audigy) + snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &snd_emu10k1_fx8010_playback_ops); snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_CAPTURE, &snd_emu10k1_capture_efx_ops); pcm->info_flags = 0; - strcpy(pcm->name, "Multichannel Capture/PT Playback"); + if (emu->audigy) + strcpy(pcm->name, "Multichannel Capture"); + else + strcpy(pcm->name, "Multichannel Capture/PT Playback"); emu->pcm_efx = pcm; /* EFX capture - record the "FXBUS2" channels, by default we connect the EXTINs From 94623f579ce338b5fa61b5acaa5beb8aa657fb9e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Apr 2023 13:54:37 +0200 Subject: [PATCH 057/175] netfilter: br_netfilter: fix recent physdev match breakage Recent attempt to ensure PREROUTING hook is executed again when a decrypted ipsec packet received on a bridge passes through the network stack a second time broke the physdev match in INPUT hook. We can't discard the nf_bridge info strct from sabotage_in hook, as this is needed by the physdev match. Keep the struct around and handle this with another conditional instead. Fixes: 2b272bb558f1 ("netfilter: br_netfilter: disable sabotage_in hook after first suppression") Reported-and-tested-by: Farid BENAMROUCHE Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/skbuff.h | 1 + net/bridge/br_netfilter_hooks.c | 17 +++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ff7ad331fb82..1ec3530c8191 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -294,6 +294,7 @@ struct nf_bridge_info { u8 pkt_otherhost:1; u8 in_prerouting:1; u8 bridged_dnat:1; + u8 sabotage_in_done:1; __u16 frag_max_size; struct net_device *physindev; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 638a4d5359db..4bc6761517bb 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -868,12 +868,17 @@ static unsigned int ip_sabotage_in(void *priv, { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - if (nf_bridge && !nf_bridge->in_prerouting && - !netif_is_l3_master(skb->dev) && - !netif_is_l3_slave(skb->dev)) { - nf_bridge_info_free(skb); - state->okfn(state->net, state->sk, skb); - return NF_STOLEN; + if (nf_bridge) { + if (nf_bridge->sabotage_in_done) + return NF_ACCEPT; + + if (!nf_bridge->in_prerouting && + !netif_is_l3_master(skb->dev) && + !netif_is_l3_slave(skb->dev)) { + nf_bridge->sabotage_in_done = 1; + state->okfn(state->net, state->sk, skb); + return NF_STOLEN; + } } return NF_ACCEPT; From af0acf22aea359e04412237d68787401f96bb583 Mon Sep 17 00:00:00 2001 From: Chen Aotian Date: Thu, 6 Apr 2023 12:01:51 +0800 Subject: [PATCH 058/175] netfilter: nf_tables: Modify nla_memdup's flag to GFP_KERNEL_ACCOUNT For memory alloc that store user data from nla[NFTA_OBJ_USERDATA], use GFP_KERNEL_ACCOUNT is more suitable. Fixes: 33758c891479 ("memcg: enable accounting for nft objects") Signed-off-by: Chen Aotian Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6004d4b24451..cd52109e674a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7052,7 +7052,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, } if (nla[NFTA_OBJ_USERDATA]) { - obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL); + obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL_ACCOUNT); if (obj->udata == NULL) goto err_userdata; From fb4a624f88f658c7b7ae124452bd42eaa8ac7168 Mon Sep 17 00:00:00 2001 From: Xu Biang Date: Thu, 6 Apr 2023 06:28:01 -0700 Subject: [PATCH 059/175] ALSA: firewire-tascam: add missing unwind goto in snd_tscm_stream_start_duplex() Smatch Warns: sound/firewire/tascam/tascam-stream.c:493 snd_tscm_stream_start_duplex() warn: missing unwind goto? The direct return will cause the stream list of "&tscm->domain" unemptied and the session in "tscm" unfinished if amdtp_domain_start() returns with an error. Fix this by changing the direct return to a goto which will empty the stream list of "&tscm->domain" and finish the session in "tscm". The snd_tscm_stream_start_duplex() function is called in the prepare callback of PCM. According to "ALSA Kernel API Documentation", the prepare callback of PCM will be called many times at each setup. So, if the "&d->streams" list is not emptied, when the prepare callback is called next time, snd_tscm_stream_start_duplex() will receive -EBUSY from amdtp_domain_add_stream() that tries to add an existing stream to the domain. The error handling code after the "error" label will be executed in this case, and the "&d->streams" list will be emptied. So not emptying the "&d->streams" list will not cause an issue. But it is more efficient and readable to empty it on the first error by changing the direct return to a goto statement. The session in "tscm" has been begun before amdtp_domain_start(), so it needs to be finished when amdtp_domain_start() fails. Fixes: c281d46a51e3 ("ALSA: firewire-tascam: support AMDTP domain") Signed-off-by: Xu Biang Reviewed-by: Dan Carpenter Acked-by: Takashi Sakamoto Cc: Link: https://lore.kernel.org/r/20230406132801.105108-1-xubiang@hust.edu.cn Signed-off-by: Takashi Iwai --- sound/firewire/tascam/tascam-stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/firewire/tascam/tascam-stream.c b/sound/firewire/tascam/tascam-stream.c index 53e094cc411f..dfe783d01d7d 100644 --- a/sound/firewire/tascam/tascam-stream.c +++ b/sound/firewire/tascam/tascam-stream.c @@ -490,7 +490,7 @@ int snd_tscm_stream_start_duplex(struct snd_tscm *tscm, unsigned int rate) // packet is important for media clock recovery. err = amdtp_domain_start(&tscm->domain, tx_init_skip_cycles, true, true); if (err < 0) - return err; + goto error; if (!amdtp_domain_wait_ready(&tscm->domain, READY_TIMEOUT_MS)) { err = -ETIMEDOUT; From e959f2beec8e655dba79c5a7111beedae5e757e0 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 6 Apr 2023 10:27:25 -0500 Subject: [PATCH 060/175] ALSA: hda: patch_realtek: add quirk for Asus N7601ZM Add pins and verbs needed to enable speakers and jack. The pins and verbs configurations were identified by snooping the Windows driver commands, with a nice write-up here: https://brakkee.org/site/2023/02/07/fixing-sound-on-the-asus-n7601zm/ Reported-by: Erik Brakkee Link: https://github.com/thesofproject/linux/issues/4176 Tested-by: Erik Brakkee Signed-off-by: Pierre-Louis Bossart Reviewed-by: Kai Vehmanen Reviewed-by: Bard Liao Cc: Link: https://lore.kernel.org/r/20230406152725.15191-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 26187f5d56b5..f554db1e7e9a 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6960,6 +6960,8 @@ enum { ALC269_FIXUP_DELL_M101Z, ALC269_FIXUP_SKU_IGNORE, ALC269_FIXUP_ASUS_G73JW, + ALC269_FIXUP_ASUS_N7601ZM_PINS, + ALC269_FIXUP_ASUS_N7601ZM, ALC269_FIXUP_LENOVO_EAPD, ALC275_FIXUP_SONY_HWEQ, ALC275_FIXUP_SONY_DISABLE_AAMIX, @@ -7256,6 +7258,29 @@ static const struct hda_fixup alc269_fixups[] = { { } } }, + [ALC269_FIXUP_ASUS_N7601ZM_PINS] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x19, 0x03A11050 }, + { 0x1a, 0x03A11C30 }, + { 0x21, 0x03211420 }, + { } + } + }, + [ALC269_FIXUP_ASUS_N7601ZM] = { + .type = HDA_FIXUP_VERBS, + .v.verbs = (const struct hda_verb[]) { + {0x20, AC_VERB_SET_COEF_INDEX, 0x62}, + {0x20, AC_VERB_SET_PROC_COEF, 0xa007}, + {0x20, AC_VERB_SET_COEF_INDEX, 0x10}, + {0x20, AC_VERB_SET_PROC_COEF, 0x8420}, + {0x20, AC_VERB_SET_COEF_INDEX, 0x0f}, + {0x20, AC_VERB_SET_PROC_COEF, 0x7774}, + { } + }, + .chained = true, + .chain_id = ALC269_FIXUP_ASUS_N7601ZM_PINS, + }, [ALC269_FIXUP_LENOVO_EAPD] = { .type = HDA_FIXUP_VERBS, .v.verbs = (const struct hda_verb[]) { @@ -9466,6 +9491,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1271, "ASUS X430UN", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1290, "ASUS X441SA", ALC233_FIXUP_EAPD_COEF_AND_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x12a0, "ASUS X441UV", ALC233_FIXUP_EAPD_COEF_AND_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1043, 0x12a3, "Asus N7691ZM", ALC269_FIXUP_ASUS_N7601ZM), SND_PCI_QUIRK(0x1043, 0x12af, "ASUS UX582ZS", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x12e0, "ASUS X541SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x12f0, "ASUS X541UV", ALC256_FIXUP_ASUS_MIC), From 10b6b4a8ac6120ec36555fd286eed577f7632e3b Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 31 Mar 2023 11:08:42 -0500 Subject: [PATCH 061/175] ACPI: x86: utils: Add Picasso to the list for forcing StorageD3Enable Picasso was the first APU that introduced s2idle support from AMD, and it was predating before vendors started to use `StorageD3Enable` in their firmware. Windows doesn't have problems with this hardware and NVME so it was likely on the list of hardcoded CPUs to use this behavior in Windows. Add it to the list for Linux to avoid NVME resume issues. Reported-by: Stuart Axon Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2449 Signed-off-by: Mario Limonciello Signed-off-by: Rafael J. Wysocki --- drivers/acpi/x86/utils.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/x86/utils.c b/drivers/acpi/x86/utils.c index da5727069d85..ba420a28a4aa 100644 --- a/drivers/acpi/x86/utils.c +++ b/drivers/acpi/x86/utils.c @@ -213,6 +213,7 @@ bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *s disk in the system. */ static const struct x86_cpu_id storage_d3_cpu_ids[] = { + X86_MATCH_VENDOR_FAM_MODEL(AMD, 23, 24, NULL), /* Picasso */ X86_MATCH_VENDOR_FAM_MODEL(AMD, 23, 96, NULL), /* Renoir */ X86_MATCH_VENDOR_FAM_MODEL(AMD, 23, 104, NULL), /* Lucienne */ X86_MATCH_VENDOR_FAM_MODEL(AMD, 25, 80, NULL), /* Cezanne */ From 88e8c2ec4ab84f9f05ed5af9693a3972baf386c4 Mon Sep 17 00:00:00 2001 From: Patrick Blass Date: Fri, 3 Mar 2023 20:06:29 +0100 Subject: [PATCH 062/175] rust: str: fix requierments->requirements typo Fix a trivial spelling error in the `rust/kernel/str.rs` file. Fixes: 247b365dc8dc ("rust: add `kernel` crate") Reported-by: Miguel Ojeda Link: https://github.com/Rust-for-Linux/linux/issues/978 Signed-off-by: Patrick Blass Reviewed-by: Vincenzo Palazzo [Reworded slightly] Signed-off-by: Miguel Ojeda --- rust/kernel/str.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/str.rs b/rust/kernel/str.rs index b771310fa4a4..cd3d2a6cf1fc 100644 --- a/rust/kernel/str.rs +++ b/rust/kernel/str.rs @@ -408,7 +408,7 @@ impl RawFormatter { /// If `pos` is less than `end`, then the region between `pos` (inclusive) and `end` /// (exclusive) must be valid for writes for the lifetime of the returned [`RawFormatter`]. pub(crate) unsafe fn from_ptrs(pos: *mut u8, end: *mut u8) -> Self { - // INVARIANT: The safety requierments guarantee the type invariants. + // INVARIANT: The safety requirements guarantee the type invariants. Self { beg: pos as _, pos: pos as _, From 4fb9a5060f73627303bc531ceaab1b19d0a24aef Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Thu, 6 Apr 2023 20:18:00 +0300 Subject: [PATCH 063/175] regulator: fan53555: Explicitly include bits header Since commit f2a9eb975ab2 ("regulator: fan53555: Add support for FAN53526") the driver makes use of the BIT() macro, but relies on the bits header being implicitly included. Explicitly pull the header in to avoid potential build failures in some configurations. While here, reorder include directives alphabetically. Fixes: f2a9eb975ab2 ("regulator: fan53555: Add support for FAN53526") Signed-off-by: Cristian Ciocaltea Link: https://lore.kernel.org/r/20230406171806.948290-3-cristian.ciocaltea@collabora.com Signed-off-by: Mark Brown --- drivers/regulator/fan53555.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/regulator/fan53555.c b/drivers/regulator/fan53555.c index 529963a7e4f5..45f07d2ad1c5 100644 --- a/drivers/regulator/fan53555.c +++ b/drivers/regulator/fan53555.c @@ -8,18 +8,19 @@ // Copyright (c) 2012 Marvell Technology Ltd. // Yunfan Zhang -#include -#include +#include #include +#include +#include +#include +#include #include +#include #include +#include #include #include -#include -#include #include -#include -#include /* Voltage setting */ #define FAN53555_VSEL0 0x00 From c5d5b55b3c1a314137a251efc1001dfd435c6242 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Thu, 6 Apr 2023 20:18:01 +0300 Subject: [PATCH 064/175] regulator: fan53555: Fix wrong TCS_SLEW_MASK The support for TCS4525 regulator has been introduced with a wrong ramp-rate mask, which has been defined as a logical expression instead of a bit shift operation. For clarity, fix it using GENMASK() macro. Fixes: 914df8faa7d6 ("regulator: fan53555: Add TCS4525 DCDC support") Signed-off-by: Cristian Ciocaltea Link: https://lore.kernel.org/r/20230406171806.948290-4-cristian.ciocaltea@collabora.com Signed-off-by: Mark Brown --- drivers/regulator/fan53555.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/fan53555.c b/drivers/regulator/fan53555.c index 45f07d2ad1c5..41537c45f036 100644 --- a/drivers/regulator/fan53555.c +++ b/drivers/regulator/fan53555.c @@ -61,7 +61,7 @@ #define TCS_VSEL1_MODE (1 << 6) #define TCS_SLEW_SHIFT 3 -#define TCS_SLEW_MASK (0x3 < 3) +#define TCS_SLEW_MASK GENMASK(4, 3) enum fan53555_vendor { FAN53526_VENDOR_FAIRCHILD = 0, From e5e86572e3f20222b5d308df9ae986c06f229321 Mon Sep 17 00:00:00 2001 From: Thomas Bamelis Date: Sun, 26 Feb 2023 15:29:29 +0000 Subject: [PATCH 065/175] rust: sort uml documentation arch support table The arch_support table was not sorted alphabetically. Sorts the table properly. Fixes: 0438aadfa69a ("rust: arch/um: Add support for CONFIG_RUST under x86_64 UML") Link: https://lore.kernel.org/rust-for-linux/CANiq72nXMsnUsJNZOG-QZiCVOqa9dRUSMc4RAS3ExLZNJ7VhHg@mail.gmail.com Reported-by: Miguel Ojeda Signed-off-by: Thomas Bamelis Reviewed-by: David Gow Signed-off-by: Miguel Ojeda --- Documentation/rust/arch-support.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/rust/arch-support.rst b/Documentation/rust/arch-support.rst index ed7f4f5b3cf1..b91e9ef4d0c2 100644 --- a/Documentation/rust/arch-support.rst +++ b/Documentation/rust/arch-support.rst @@ -15,7 +15,7 @@ support corresponds to ``S`` values in the ``MAINTAINERS`` file. ============ ================ ============================================== Architecture Level of support Constraints ============ ================ ============================================== -``x86`` Maintained ``x86_64`` only. ``um`` Maintained ``x86_64`` only. +``x86`` Maintained ``x86_64`` only. ============ ================ ============================================== From c682e4c37d2b8ba3bde1125cbbea4ee88824b4e2 Mon Sep 17 00:00:00 2001 From: David Gow Date: Wed, 15 Feb 2023 06:47:35 +0800 Subject: [PATCH 066/175] rust: kernel: Mark rust_fmt_argument as extern "C" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rust_fmt_argument function is called from printk() to handle the %pA format specifier. Since it's called from C, we should mark it extern "C" to make sure it's ABI compatible. Cc: stable@vger.kernel.org Fixes: 247b365dc8dc ("rust: add `kernel` crate") Signed-off-by: David Gow Reviewed-by: Gary Guo Reviewed-by: Björn Roy Baron Reviewed-by: Vincenzo Palazzo [Applied `rustfmt`] Signed-off-by: Miguel Ojeda --- rust/kernel/print.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rust/kernel/print.rs b/rust/kernel/print.rs index 30103325696d..8009184bf6d7 100644 --- a/rust/kernel/print.rs +++ b/rust/kernel/print.rs @@ -18,7 +18,11 @@ use crate::bindings; // Called from `vsprintf` with format specifier `%pA`. #[no_mangle] -unsafe fn rust_fmt_argument(buf: *mut c_char, end: *mut c_char, ptr: *const c_void) -> *mut c_char { +unsafe extern "C" fn rust_fmt_argument( + buf: *mut c_char, + end: *mut c_char, + ptr: *const c_void, +) -> *mut c_char { use fmt::Write; // SAFETY: The C contract guarantees that `buf` is valid if it's less than `end`. let mut w = unsafe { RawFormatter::from_ptrs(buf.cast(), end.cast()) }; From 5c7548d5a25306dcdb97689479be81cacc8ce596 Mon Sep 17 00:00:00 2001 From: Asahi Lina Date: Fri, 7 Apr 2023 00:25:22 +0200 Subject: [PATCH 067/175] scripts: generate_rust_analyzer: Handle sub-modules with no Makefile More complex drivers might want to use modules to organize their Rust code, but those module folders do not need a Makefile. generate_rust_analyzer.py currently crashes on those. Fix it so that a missing Makefile is silently ignored. Link: https://github.com/Rust-for-Linux/linux/pull/883 Signed-off-by: Asahi Lina Signed-off-by: Miguel Ojeda --- scripts/generate_rust_analyzer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/generate_rust_analyzer.py b/scripts/generate_rust_analyzer.py index ecc7ea9a4dcf..946e250c1b2a 100755 --- a/scripts/generate_rust_analyzer.py +++ b/scripts/generate_rust_analyzer.py @@ -104,7 +104,10 @@ def generate_crates(srctree, objtree, sysroot_src): name = path.name.replace(".rs", "") # Skip those that are not crate roots. - if f"{name}.o" not in open(path.parent / "Makefile").read(): + try: + if f"{name}.o" not in open(path.parent / "Makefile").read(): + continue + except FileNotFoundError: continue logging.info("Adding %s", name) From 1c5f054f0b12875096e339861c7f44a7c952ce56 Mon Sep 17 00:00:00 2001 From: Vincenzo Palazzo Date: Thu, 2 Mar 2023 14:21:07 +0100 Subject: [PATCH 068/175] rust: build: Fix grep warning Fix grep warning during the build, with GNU grep 3.8 with the following command `grep -v '^\#\|^$$' rust/bindgen_parameters` I see the following warning ``` grep: warning: stray \ before # --opaque-type xregs_state --opaque-type desc_struct --opaque-type arch_lbr_state --opaque-type local_apic --opaque-type x86_msi_data --opaque-type x86_msi_addr_lo --opaque-type kunit_try_catch --opaque-type spinlock --no-doc-comments ``` Signed-off-by: Vincenzo Palazzo Tested-by: Martin Rodriguez Reboredo Reviewed-by: Martin Rodriguez Reboredo Signed-off-by: Miguel Ojeda --- rust/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/Makefile b/rust/Makefile index f88d108fbef0..41ac8401a8be 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -283,7 +283,7 @@ quiet_cmd_bindgen = BINDGEN $@ $(bindgen_target_cflags) $(bindgen_target_extra) $(obj)/bindings/bindings_generated.rs: private bindgen_target_flags = \ - $(shell grep -v '^\#\|^$$' $(srctree)/$(src)/bindgen_parameters) + $(shell grep -v '^#\|^$$' $(srctree)/$(src)/bindgen_parameters) $(obj)/bindings/bindings_generated.rs: $(src)/bindings/bindings_helper.h \ $(src)/bindgen_parameters FORCE $(call if_changed_dep,bindgen) From 75eab749e7aec0b7b515d7c50ed429ef4e1c5f3f Mon Sep 17 00:00:00 2001 From: Abhinav Kumar Date: Wed, 29 Mar 2023 16:34:16 -0700 Subject: [PATCH 069/175] arm64: dts: qcom: sc7280: remove hbr3 support on herobrine boards There are some interop issues seen across a few DP monitors with HBR3 and herobrine boards where the DP display stays blank with hbr3. This is still under investigation but in preparation for supporting higher resolutions, its better to disable HBR3 till the issues are root-caused as there is really no guarantee which monitors will show the issue and which would not. This can be enabled back after successful validation across more DP sinks. Signed-off-by: Abhinav Kumar Reviewed-by: Douglas Anderson Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20230329233416.27152-1-quic_abhinavk@quicinc.com --- arch/arm64/boot/dts/qcom/sc7280-herobrine.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/sc7280-herobrine.dtsi b/arch/arm64/boot/dts/qcom/sc7280-herobrine.dtsi index b6137816f2f3..313083ec1f39 100644 --- a/arch/arm64/boot/dts/qcom/sc7280-herobrine.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7280-herobrine.dtsi @@ -464,7 +464,7 @@ ap_i2c_tpm: &i2c14 { &mdss_dp_out { data-lanes = <0 1>; - link-frequencies = /bits/ 64 <1620000000 2700000000 5400000000 8100000000>; + link-frequencies = /bits/ 64 <1620000000 2700000000 5400000000>; }; &mdss_mdp { From d83806c4c0cccc0d6d3c3581a11983a9c186a138 Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Sun, 26 Mar 2023 18:21:21 +0000 Subject: [PATCH 070/175] purgatory: fix disabling debug info Since 32ef9e5054ec, -Wa,-gdwarf-2 is no longer used in KBUILD_AFLAGS. Instead, it includes -g, the appropriate -gdwarf-* flag, and also the -Wa versions of both of those if building with Clang and GNU as. As a result, debug info was being generated for the purgatory objects, even though the intention was that it not be. Fixes: 32ef9e5054ec ("Makefile.debug: re-enable debug info for .S files") Signed-off-by: Alyssa Ross Cc: stable@vger.kernel.org Acked-by: Nick Desaulniers Signed-off-by: Masahiro Yamada --- arch/riscv/purgatory/Makefile | 7 +------ arch/x86/purgatory/Makefile | 3 +-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile index d16bf715a586..5730797a6b40 100644 --- a/arch/riscv/purgatory/Makefile +++ b/arch/riscv/purgatory/Makefile @@ -84,12 +84,7 @@ CFLAGS_string.o += $(PURGATORY_CFLAGS) CFLAGS_REMOVE_ctype.o += $(PURGATORY_CFLAGS_REMOVE) CFLAGS_ctype.o += $(PURGATORY_CFLAGS) -AFLAGS_REMOVE_entry.o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_memcpy.o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_memset.o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_strcmp.o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_strlen.o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_strncmp.o += -Wa,-gdwarf-2 +asflags-remove-y += $(foreach x, -g -gdwarf-4 -gdwarf-5, $(x) -Wa,$(x)) $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 17f09dc26381..82fec66d46d2 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -69,8 +69,7 @@ CFLAGS_sha256.o += $(PURGATORY_CFLAGS) CFLAGS_REMOVE_string.o += $(PURGATORY_CFLAGS_REMOVE) CFLAGS_string.o += $(PURGATORY_CFLAGS) -AFLAGS_REMOVE_setup-x86_$(BITS).o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_entry64.o += -Wa,-gdwarf-2 +asflags-remove-y += $(foreach x, -g -gdwarf-4 -gdwarf-5, $(x) -Wa,$(x)) $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) From dcc11ac9dcaffdce428794f282c100a736244b55 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 7 Apr 2023 14:42:48 -0700 Subject: [PATCH 071/175] Documentation/llvm: Add a note about prebuilt kernel.org toolchains I recently started uploading prebuilt stable versions of LLVM to kernel.org, which should make building the kernel with LLVM more accessible to maintainers and developers. Link them in the LLVM documentation to make this more visible. Link: https://lore.kernel.org/20230319235619.GA18547@dev-arch.thelio-3990X/ Suggested-by: Nick Desaulniers Reviewed-by: Bill Wendling Reviewed-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- Documentation/kbuild/llvm.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/kbuild/llvm.rst b/Documentation/kbuild/llvm.rst index bfb51685073c..c3851fe1900d 100644 --- a/Documentation/kbuild/llvm.rst +++ b/Documentation/kbuild/llvm.rst @@ -171,6 +171,10 @@ Getting Help Getting LLVM ------------- +We provide prebuilt stable versions of LLVM on `kernel.org `_. +Below are links that may be useful for building LLVM from source or procuring +it through a distribution's package manager. + - https://releases.llvm.org/download.html - https://github.com/llvm/llvm-project - https://llvm.org/docs/GettingStarted.html From aa7d233f45b4c549750044c9921f7afcbe50925b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 10 Apr 2023 21:09:07 +0900 Subject: [PATCH 072/175] kbuild: give up untracked files for source package builds When the source tree is dirty and contains untracked files, package builds may fail, for example, when a broken symlink exists, a file path contains whitespaces, etc. Since commit 05e96e96a315 ("kbuild: use git-archive for source package creation"), the source tarball only contains committed files because it is created by 'git archive'. scripts/package/gen-diff-patch tries to address the diff from HEAD, but including untracked files by the hand-crafted script introduces more complexity. I wrote a patch [1] to make it work in most cases, but still wonder if this is what we should aim for. To simplify the code, this patch just gives up untracked files. Going forward, it is your responsibility to do 'git add' for what you want in the source package. The script shows a warning just in case you forgot to do so. It should be checked only when building source packages. [1]: https://lore.kernel.org/all/CAK7LNAShbZ56gSh9PrbLnBDYKnjtTkHMoCXeGrhcxMvqXGq9=g@mail.gmail.com/2-0001-kbuild-make-package-builds-more-robust.patch Fixes: 05e96e96a315 ("kbuild: use git-archive for source package creation") Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/Makefile.package | 3 +- scripts/package/gen-diff-patch | 64 +++++++++----------- scripts/package/mkdebian | 103 +++++++++++++++++++-------------- scripts/package/mkspec | 11 +--- 4 files changed, 91 insertions(+), 90 deletions(-) diff --git a/scripts/Makefile.package b/scripts/Makefile.package index 61f72eb8d9be..49aff12cb6ab 100644 --- a/scripts/Makefile.package +++ b/scripts/Makefile.package @@ -94,7 +94,7 @@ binrpm-pkg: $(UTS_MACHINE)-linux -bb $(objtree)/binkernel.spec quiet_cmd_debianize = GEN $@ - cmd_debianize = $(srctree)/scripts/package/mkdebian + cmd_debianize = $(srctree)/scripts/package/mkdebian $(mkdebian-opts) debian: FORCE $(call cmd,debianize) @@ -103,6 +103,7 @@ PHONY += debian-orig debian-orig: private source = $(shell dpkg-parsechangelog -S Source) debian-orig: private version = $(shell dpkg-parsechangelog -S Version | sed 's/-[^-]*$$//') debian-orig: private orig-name = $(source)_$(version).orig.tar.gz +debian-orig: mkdebian-opts = --need-source debian-orig: linux.tar.gz debian $(Q)if [ "$(df --output=target .. 2>/dev/null)" = "$(df --output=target $< 2>/dev/null)" ]; then \ ln -f $< ../$(orig-name); \ diff --git a/scripts/package/gen-diff-patch b/scripts/package/gen-diff-patch index f842ab50a780..8a98b7bb78a0 100755 --- a/scripts/package/gen-diff-patch +++ b/scripts/package/gen-diff-patch @@ -1,44 +1,36 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0-only -diff_patch="${1}" -untracked_patch="${2}" -srctree=$(dirname $0)/../.. +diff_patch=$1 -rm -f ${diff_patch} ${untracked_patch} +mkdir -p "$(dirname "${diff_patch}")" -if ! ${srctree}/scripts/check-git; then +git -C "${srctree:-.}" diff HEAD > "${diff_patch}" + +if [ ! -s "${diff_patch}" ] || + [ -z "$(git -C "${srctree:-.}" ls-files --other --exclude-standard | head -n1)" ]; then exit fi -mkdir -p "$(dirname ${diff_patch})" "$(dirname ${untracked_patch})" - -git -C "${srctree}" diff HEAD > "${diff_patch}" - -if [ ! -s "${diff_patch}" ]; then - rm -f "${diff_patch}" - exit -fi - -git -C ${srctree} status --porcelain --untracked-files=all | -while read stat path -do - if [ "${stat}" = '??' ]; then - - if ! diff -u /dev/null "${srctree}/${path}" > .tmp_diff && - ! head -n1 .tmp_diff | grep -q "Binary files"; then - { - echo "--- /dev/null" - echo "+++ linux/$path" - cat .tmp_diff | tail -n +3 - } >> ${untracked_patch} - fi - fi -done - -rm -f .tmp_diff - -if [ ! -s "${diff_patch}" ]; then - rm -f "${diff_patch}" - exit -fi +# The source tarball, which is generated by 'git archive', contains everything +# you committed in the repository. If you have local diff ('git diff HEAD'), +# it will go into ${diff_patch}. If untracked files are remaining, the resulting +# source package may not be correct. +# +# Examples: +# - You modified a source file to add #include "new-header.h" +# but forgot to add new-header.h +# - You modified a Makefile to add 'obj-$(CONFIG_FOO) += new-dirver.o' +# but you forgot to add new-driver.c +# +# You need to commit them, or at least stage them by 'git add'. +# +# This script does not take care of untracked files because doing so would +# introduce additional complexity. Instead, print a warning message here if +# untracked files are found. +# If all untracked files are just garbage, you can ignore this warning. +echo >&2 "============================ WARNING ============================" +echo >&2 "Your working tree has diff from HEAD, and also untracked file(s)." +echo >&2 "Please make sure you did 'git add' for all new files you need in" +echo >&2 "the source package." +echo >&2 "=================================================================" diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index e20a2b5be9eb..a4c2c2276223 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -84,7 +84,66 @@ set_debarch() { fi } +# Create debian/source/ if it is a source package build +gen_source () +{ + mkdir -p debian/source + + echo "3.0 (quilt)" > debian/source/format + + { + echo "diff-ignore" + echo "extend-diff-ignore = .*" + } > debian/source/local-options + + # Add .config as a patch + mkdir -p debian/patches + { + echo "Subject: Add .config" + echo "Author: ${maintainer}" + echo + echo "--- /dev/null" + echo "+++ linux/.config" + diff -u /dev/null "${KCONFIG_CONFIG}" | tail -n +3 + } > debian/patches/config.patch + echo config.patch > debian/patches/series + + "${srctree}/scripts/package/gen-diff-patch" debian/patches/diff.patch + if [ -s debian/patches/diff.patch ]; then + sed -i " + 1iSubject: Add local diff + 1iAuthor: ${maintainer} + 1i + " debian/patches/diff.patch + + echo diff.patch >> debian/patches/series + else + rm -f debian/patches/diff.patch + fi +} + rm -rf debian +mkdir debian + +email=${DEBEMAIL-$EMAIL} + +# use email string directly if it contains +if echo "${email}" | grep -q '<.*>'; then + maintainer=${email} +else + # or construct the maintainer string + user=${KBUILD_BUILD_USER-$(id -nu)} + name=${DEBFULLNAME-${user}} + if [ -z "${email}" ]; then + buildhost=${KBUILD_BUILD_HOST-$(hostname -f 2>/dev/null || hostname)} + email="${user}@${buildhost}" + fi + maintainer="${name} <${email}>" +fi + +if [ "$1" = --need-source ]; then + gen_source +fi # Some variables and settings used throughout the script version=$KERNELRELEASE @@ -104,22 +163,6 @@ fi debarch= set_debarch -email=${DEBEMAIL-$EMAIL} - -# use email string directly if it contains -if echo $email | grep -q '<.*>'; then - maintainer=$email -else - # or construct the maintainer string - user=${KBUILD_BUILD_USER-$(id -nu)} - name=${DEBFULLNAME-$user} - if [ -z "$email" ]; then - buildhost=${KBUILD_BUILD_HOST-$(hostname -f 2>/dev/null || hostname)} - email="$user@$buildhost" - fi - maintainer="$name <$email>" -fi - # Try to determine distribution if [ -n "$KDEB_CHANGELOG_DIST" ]; then distribution=$KDEB_CHANGELOG_DIST @@ -132,34 +175,6 @@ else echo >&2 "Install lsb-release or set \$KDEB_CHANGELOG_DIST explicitly" fi -mkdir -p debian/source/ -echo "3.0 (quilt)" > debian/source/format - -{ - echo "diff-ignore" - echo "extend-diff-ignore = .*" -} > debian/source/local-options - -# Add .config as a patch -mkdir -p debian/patches -{ - echo "Subject: Add .config" - echo "Author: ${maintainer}" - echo - echo "--- /dev/null" - echo "+++ linux/.config" - diff -u /dev/null "${KCONFIG_CONFIG}" | tail -n +3 -} > debian/patches/config -echo config > debian/patches/series - -$(dirname $0)/gen-diff-patch debian/patches/diff.patch debian/patches/untracked.patch -if [ -f debian/patches/diff.patch ]; then - echo diff.patch >> debian/patches/series -fi -if [ -f debian/patches/untracked.patch ]; then - echo untracked.patch >> debian/patches/series -fi - echo $debarch > debian/arch extra_build_depends=", $(if_enabled_echo CONFIG_UNWINDER_ORC libelf-dev:native)" extra_build_depends="$extra_build_depends, $(if_enabled_echo CONFIG_SYSTEM_TRUSTED_KEYRING libssl-dev:native)" diff --git a/scripts/package/mkspec b/scripts/package/mkspec index b7d1dc28a5d6..fc8ad3fbc0a9 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -19,8 +19,7 @@ else mkdir -p rpmbuild/SOURCES cp linux.tar.gz rpmbuild/SOURCES cp "${KCONFIG_CONFIG}" rpmbuild/SOURCES/config - $(dirname $0)/gen-diff-patch rpmbuild/SOURCES/diff.patch rpmbuild/SOURCES/untracked.patch - touch rpmbuild/SOURCES/diff.patch rpmbuild/SOURCES/untracked.patch + "${srctree}/scripts/package/gen-diff-patch" rpmbuild/SOURCES/diff.patch fi if grep -q CONFIG_MODULES=y include/config/auto.conf; then @@ -56,7 +55,6 @@ sed -e '/^DEL/d' -e 's/^\t*//' < Date: Sun, 9 Apr 2023 10:55:29 +0800 Subject: [PATCH 073/175] regulator: sm5703: Fix missing n_voltages for fixed regulators Set n_voltages = 1 for fixed regulators. Signed-off-by: Axel Lin Reviewed-by: Markuss Broks Link: https://lore.kernel.org/r/20230409025529.241699-1-axel.lin@ingics.com Signed-off-by: Mark Brown --- drivers/regulator/sm5703-regulator.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/regulator/sm5703-regulator.c b/drivers/regulator/sm5703-regulator.c index 05ad28fc4da8..229df7170792 100644 --- a/drivers/regulator/sm5703-regulator.c +++ b/drivers/regulator/sm5703-regulator.c @@ -42,6 +42,7 @@ static const int sm5703_buck_voltagemap[] = { .type = REGULATOR_VOLTAGE, \ .id = SM5703_USBLDO ## _id, \ .ops = &sm5703_regulator_ops_fixed, \ + .n_voltages = 1, \ .fixed_uV = SM5703_USBLDO_MICROVOLT, \ .enable_reg = SM5703_REG_USBLDO12, \ .enable_mask = SM5703_REG_EN_USBLDO ##_id, \ @@ -56,6 +57,7 @@ static const int sm5703_buck_voltagemap[] = { .type = REGULATOR_VOLTAGE, \ .id = SM5703_VBUS, \ .ops = &sm5703_regulator_ops_fixed, \ + .n_voltages = 1, \ .fixed_uV = SM5703_VBUS_MICROVOLT, \ .enable_reg = SM5703_REG_CNTL, \ .enable_mask = SM5703_OPERATION_MODE_MASK, \ From 117e4e5bd9d47b89777dbf6b37a709dcfe59520f Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 10 Apr 2023 10:35:01 -0700 Subject: [PATCH 074/175] thermal: intel: Avoid updating unsupported THERM_STATUS_CLEAR mask bits Some older processors don't allow BIT(13) and BIT(15) in the current mask set by "THERM_STATUS_CLEAR_CORE_MASK". This results in: unchecked MSR access error: WRMSR to 0x19c (tried to write 0x000000000000aaa8) at rIP: 0xffffffff816f66a6 (throttle_active_work+0xa6/0x1d0) To avoid unchecked MSR issues, check CPUID for each relevant feature and use that information to set the supported feature bits only in the "clear" mask for cores. Do the same for the analogous package mask set by "THERM_STATUS_CLEAR_PKG_MASK". Introduce functions thermal_intr_init_core_clear_mask() and thermal_intr_init_pkg_clear_mask() to set core and package mask bits, respectively. These functions are called during initialization. Fixes: 6fe1e64b6026 ("thermal: intel: Prevent accidental clearing of HFI status") Reported-by: Rui Salvaterra Link: https://lore.kernel.org/lkml/cdf43fb423368ee3994124a9e8c9b4f8d00712c6.camel@linux.intel.com/T/ Tested-by: Rui Salvaterra Signed-off-by: Srinivas Pandruvada Cc: 6.2+ # 6.2+ [ rjw: Renamed 2 funtions and 2 static variables, edited subject and changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel/therm_throt.c | 73 ++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 7 deletions(-) diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c index 2e22bb82b738..e69868e868eb 100644 --- a/drivers/thermal/intel/therm_throt.c +++ b/drivers/thermal/intel/therm_throt.c @@ -193,8 +193,67 @@ static const struct attribute_group thermal_attr_group = { #define THERM_THROT_POLL_INTERVAL HZ #define THERM_STATUS_PROCHOT_LOG BIT(1) -#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15)) -#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11)) +static u64 therm_intr_core_clear_mask; +static u64 therm_intr_pkg_clear_mask; + +static void thermal_intr_init_core_clear_mask(void) +{ + if (therm_intr_core_clear_mask) + return; + + /* + * Reference: Intel SDM Volume 4 + * "Table 2-2. IA-32 Architectural MSRs", MSR 0x19C + * IA32_THERM_STATUS. + */ + + /* + * Bit 1, 3, 5: CPUID.01H:EDX[22] = 1. This driver will not + * enable interrupts, when 0 as it checks for X86_FEATURE_ACPI. + */ + therm_intr_core_clear_mask = (BIT(1) | BIT(3) | BIT(5)); + + /* + * Bit 7 and 9: Thermal Threshold #1 and #2 log + * If CPUID.01H:ECX[8] = 1 + */ + if (boot_cpu_has(X86_FEATURE_TM2)) + therm_intr_core_clear_mask |= (BIT(7) | BIT(9)); + + /* Bit 11: Power Limitation log (R/WC0) If CPUID.06H:EAX[4] = 1 */ + if (boot_cpu_has(X86_FEATURE_PLN)) + therm_intr_core_clear_mask |= BIT(11); + + /* + * Bit 13: Current Limit log (R/WC0) If CPUID.06H:EAX[7] = 1 + * Bit 15: Cross Domain Limit log (R/WC0) If CPUID.06H:EAX[7] = 1 + */ + if (boot_cpu_has(X86_FEATURE_HWP)) + therm_intr_core_clear_mask |= (BIT(13) | BIT(15)); +} + +static void thermal_intr_init_pkg_clear_mask(void) +{ + if (therm_intr_pkg_clear_mask) + return; + + /* + * Reference: Intel SDM Volume 4 + * "Table 2-2. IA-32 Architectural MSRs", MSR 0x1B1 + * IA32_PACKAGE_THERM_STATUS. + */ + + /* All bits except BIT 26 depend on CPUID.06H: EAX[6] = 1 */ + if (boot_cpu_has(X86_FEATURE_PTS)) + therm_intr_pkg_clear_mask = (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11)); + + /* + * Intel SDM Volume 2A: Thermal and Power Management Leaf + * Bit 26: CPUID.06H: EAX[19] = 1 + */ + if (boot_cpu_has(X86_FEATURE_HFI)) + therm_intr_pkg_clear_mask |= BIT(26); +} /* * Clear the bits in package thermal status register for bit = 1 @@ -207,13 +266,10 @@ void thermal_clear_package_intr_status(int level, u64 bit_mask) if (level == CORE_LEVEL) { msr = MSR_IA32_THERM_STATUS; - msr_val = THERM_STATUS_CLEAR_CORE_MASK; + msr_val = therm_intr_core_clear_mask; } else { msr = MSR_IA32_PACKAGE_THERM_STATUS; - msr_val = THERM_STATUS_CLEAR_PKG_MASK; - if (boot_cpu_has(X86_FEATURE_HFI)) - msr_val |= BIT(26); - + msr_val = therm_intr_pkg_clear_mask; } msr_val &= ~bit_mask; @@ -708,6 +764,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c) h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); + thermal_intr_init_core_clear_mask(); + thermal_intr_init_pkg_clear_mask(); + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) wrmsr(MSR_IA32_THERM_INTERRUPT, From 4654e9f9f43993eb9ce383fa7c88d14b052b8cc3 Mon Sep 17 00:00:00 2001 From: Wyes Karny Date: Thu, 30 Mar 2023 14:13:14 +0000 Subject: [PATCH 075/175] amd-pstate: Fix amd_pstate mode switch amd_pstate mode can be changed by writing the mode name to the `status` sysfs. But some combinations are not working. Fix this issue by taking care of the edge cases. Before the fix the mode change combination test fails: #./pst_test.sh Test passed: from: disable, to Test passed: from: disable, to disable Test failed: 1, From mode: disable, to mode: passive Test failed: 1, From mode: disable, to mode: active Test failed: 1, From mode: passive, to mode: active Test passed: from: passive, to disable Test failed: 1, From mode: passive, to mode: passive Test failed: 1, From mode: passive, to mode: active Test failed: 1, From mode: active, to mode: active Test passed: from: active, to disable Test failed: 1, From mode: active, to mode: passive Test failed: 1, From mode: active, to mode: active After the fix test passes: #./pst_test.sh Test passed: from: disable, to Test passed: from: disable, to disable Test passed: from: disable, to passive Test passed: from: disable, to active Test passed: from: passive, to active Test passed: from: passive, to disable Test passed: from: passive, to passive Test passed: from: passive, to active Test passed: from: active, to active Test passed: from: active, to disable Test passed: from: active, to passive Test passed: from: active, to active Fixes: abd61c08ef349 ("cpufreq: amd-pstate: add driver working mode switch support") Acked-by: Huang Rui Reviewed-by: Alexey Kardashevskiy Signed-off-by: Wyes Karny Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 73c7643b2697..8dd46fad151e 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -840,22 +840,20 @@ static int amd_pstate_update_status(const char *buf, size_t size) switch(mode_idx) { case AMD_PSTATE_DISABLE: - if (!current_pstate_driver) - return -EINVAL; - if (cppc_state == AMD_PSTATE_ACTIVE) - return -EBUSY; - cpufreq_unregister_driver(current_pstate_driver); - amd_pstate_driver_cleanup(); + if (current_pstate_driver) { + cpufreq_unregister_driver(current_pstate_driver); + amd_pstate_driver_cleanup(); + } break; case AMD_PSTATE_PASSIVE: if (current_pstate_driver) { if (current_pstate_driver == &amd_pstate_driver) return 0; cpufreq_unregister_driver(current_pstate_driver); - cppc_state = AMD_PSTATE_PASSIVE; - current_pstate_driver = &amd_pstate_driver; } + current_pstate_driver = &amd_pstate_driver; + cppc_state = AMD_PSTATE_PASSIVE; ret = cpufreq_register_driver(current_pstate_driver); break; case AMD_PSTATE_ACTIVE: @@ -863,10 +861,10 @@ static int amd_pstate_update_status(const char *buf, size_t size) if (current_pstate_driver == &amd_pstate_epp_driver) return 0; cpufreq_unregister_driver(current_pstate_driver); - current_pstate_driver = &amd_pstate_epp_driver; - cppc_state = AMD_PSTATE_ACTIVE; } + current_pstate_driver = &amd_pstate_epp_driver; + cppc_state = AMD_PSTATE_ACTIVE; ret = cpufreq_register_driver(current_pstate_driver); break; default: From 05cda427126f30ce3fc8ffd82fd6f5196398d502 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Tue, 11 Apr 2023 20:31:44 +0200 Subject: [PATCH 076/175] ACPI: resource: Skip IRQ override on ASUS ExpertBook B1502CBA Like the ASUS ExpertBook B2502CBA and various ASUS Vivobook laptops, the ASUS ExpertBook B1502CBA has an ACPI DSDT table that describes IRQ 1 as ActiveLow while the kernel overrides it to Edge_High. $ sudo dmesg | grep DMI DMI: ASUSTeK COMPUTER INC. ASUS EXPERTBOOK B1502CBA_B1502CBA/B1502CBA, BIOS B1502CBA.300 01/18/2023 $ grep -A 40 PS2K dsdt.dsl | grep IRQ -A 1 IRQ (Level, ActiveLow, Exclusive, ) {1} This prevents the keyboard from working. To fix this issue, add this laptop to the skip_override_table so that the kernel does not override IRQ 1. Link: https://bugzilla.kernel.org/show_bug.cgi?id=217323 Signed-off-by: Paul Menzel Signed-off-by: Rafael J. Wysocki --- drivers/acpi/resource.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 7b4801ce62d6..e8492b3a393a 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -439,6 +439,13 @@ static const struct dmi_system_id asus_laptop[] = { DMI_MATCH(DMI_BOARD_NAME, "S5602ZA"), }, }, + { + .ident = "Asus ExpertBook B1502CBA", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_BOARD_NAME, "B1502CBA"), + }, + }, { .ident = "Asus ExpertBook B2402CBA", .matches = { From 8d736482749f6d350892ef83a7a11d43cd49981e Mon Sep 17 00:00:00 2001 From: Mathis Salmen Date: Thu, 6 Apr 2023 12:11:31 +0200 Subject: [PATCH 077/175] riscv: add icache flush for nommu sigreturn trampoline In a NOMMU kernel, sigreturn trampolines are generated on the user stack by setup_rt_frame. Currently, these trampolines are not instruction fenced, thus their visibility to ifetch is not guaranteed. This patch adds a flush_icache_range in setup_rt_frame to fix this problem. Signed-off-by: Mathis Salmen Fixes: 6bd33e1ece52 ("riscv: add nommu support") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230406101130.82304-1-mathis.salmen@matsal.de Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/signal.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index bfb2afa4135f..dee66c9290cc 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -19,6 +19,7 @@ #include #include #include +#include extern u32 __user_rt_sigreturn[2]; @@ -181,6 +182,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, { struct rt_sigframe __user *frame; long err = 0; + unsigned long __maybe_unused addr; frame = get_sigframe(ksig, regs, sizeof(*frame)); if (!access_ok(frame, sizeof(*frame))) @@ -209,7 +211,12 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, if (copy_to_user(&frame->sigreturn_code, __user_rt_sigreturn, sizeof(frame->sigreturn_code))) return -EFAULT; - regs->ra = (unsigned long)&frame->sigreturn_code; + + addr = (unsigned long)&frame->sigreturn_code; + /* Make sure the two instructions are pushed to icache. */ + flush_icache_range(addr, addr + sizeof(frame->sigreturn_code)); + + regs->ra = addr; #endif /* CONFIG_MMU */ /* From c8e22b7a1694bb8d025ea636816472739d859145 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 4 Apr 2023 21:23:42 +0200 Subject: [PATCH 078/175] scsi: ses: Handle enclosure with just a primary component gracefully This reverts commit 3fe97ff3d949 ("scsi: ses: Don't attach if enclosure has no components") and introduces proper handling of case where there are no detected secondary components, but primary component (enumerated in num_enclosures) does exist. That fix was originally proposed by Ding Hui . Completely ignoring devices that have one primary enclosure and no secondary one results in ses_intf_add() bailing completely scsi 2:0:0:254: enclosure has no enumerated components scsi 2:0:0:254: Failed to bind enclosure -12ven in valid configurations such even on valid configurations with 1 primary and 0 secondary enclosures as below: # sg_ses /dev/sg0 3PARdata SES 3321 Supported diagnostic pages: Supported Diagnostic Pages [sdp] [0x0] Configuration (SES) [cf] [0x1] Short Enclosure Status (SES) [ses] [0x8] # sg_ses -p cf /dev/sg0 3PARdata SES 3321 Configuration diagnostic page: number of secondary subenclosures: 0 generation code: 0x0 enclosure descriptor list Subenclosure identifier: 0 [primary] relative ES process id: 0, number of ES processes: 1 number of type descriptor headers: 1 enclosure logical identifier (hex): 20000002ac02068d enclosure vendor: 3PARdata product: VV rev: 3321 type descriptor header and text list Element type: Unspecified, subenclosure id: 0 number of possible elements: 1 The changelog for the original fix follows ===== We can get a crash when disconnecting the iSCSI session, the call trace like this: [ffff00002a00fb70] kfree at ffff00000830e224 [ffff00002a00fba0] ses_intf_remove at ffff000001f200e4 [ffff00002a00fbd0] device_del at ffff0000086b6a98 [ffff00002a00fc50] device_unregister at ffff0000086b6d58 [ffff00002a00fc70] __scsi_remove_device at ffff00000870608c [ffff00002a00fca0] scsi_remove_device at ffff000008706134 [ffff00002a00fcc0] __scsi_remove_target at ffff0000087062e4 [ffff00002a00fd10] scsi_remove_target at ffff0000087064c0 [ffff00002a00fd70] __iscsi_unbind_session at ffff000001c872c4 [ffff00002a00fdb0] process_one_work at ffff00000810f35c [ffff00002a00fe00] worker_thread at ffff00000810f648 [ffff00002a00fe70] kthread at ffff000008116e98 In ses_intf_add, components count could be 0, and kcalloc 0 size scomp, but not saved in edev->component[i].scratch In this situation, edev->component[0].scratch is an invalid pointer, when kfree it in ses_intf_remove_enclosure, a crash like above would happen The call trace also could be other random cases when kfree cannot catch the invalid pointer We should not use edev->component[] array when the components count is 0 We also need check index when use edev->component[] array in ses_enclosure_data_process ===== Reported-by: Michal Kolar Originally-by: Ding Hui Cc: stable@vger.kernel.org Fixes: 3fe97ff3d949 ("scsi: ses: Don't attach if enclosure has no components") Signed-off-by: Jiri Kosina Link: https://lore.kernel.org/r/nycvar.YFH.7.76.2304042122270.29760@cbobk.fhfr.pm Tested-by: Michal Kolar Signed-off-by: Martin K. Petersen --- drivers/scsi/ses.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index b11a9162e73a..b54f2c6c08c3 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -509,9 +509,6 @@ static int ses_enclosure_find_by_addr(struct enclosure_device *edev, int i; struct ses_component *scomp; - if (!edev->component[0].scratch) - return 0; - for (i = 0; i < edev->components; i++) { scomp = edev->component[i].scratch; if (scomp->addr != efd->addr) @@ -602,8 +599,10 @@ static void ses_enclosure_data_process(struct enclosure_device *edev, components++, type_ptr[0], name); - else + else if (components < edev->components) ecomp = &edev->component[components++]; + else + ecomp = ERR_PTR(-EINVAL); if (!IS_ERR(ecomp)) { if (addl_desc_ptr) { @@ -734,11 +733,6 @@ static int ses_intf_add(struct device *cdev, components += type_ptr[1]; } - if (components == 0) { - sdev_printk(KERN_WARNING, sdev, "enclosure has no enumerated components\n"); - goto err_free; - } - ses_dev->page1 = buf; ses_dev->page1_len = len; buf = NULL; @@ -780,9 +774,11 @@ static int ses_intf_add(struct device *cdev, buf = NULL; } page2_not_supported: - scomp = kcalloc(components, sizeof(struct ses_component), GFP_KERNEL); - if (!scomp) - goto err_free; + if (components > 0) { + scomp = kcalloc(components, sizeof(struct ses_component), GFP_KERNEL); + if (!scomp) + goto err_free; + } edev = enclosure_register(cdev->parent, dev_name(&sdev->sdev_gendev), components, &ses_enclosure_callbacks); From 91dcf1e8068e9a8823e419a7a34ff4341275fb70 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 11 Apr 2023 11:06:11 +0200 Subject: [PATCH 079/175] sched/fair: Fix imbalance overflow When local group is fully busy but its average load is above system load, computing the imbalance will overflow and local group is not the best target for pulling this load. Fixes: 0b0695f2b34a ("sched/fair: Rework load_balance()") Reported-by: Tingjia Cao Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tingjia Cao Link: https://lore.kernel.org/lkml/CABcWv9_DAhVBOq2=W=2ypKE9dKM5s2DvoV8-U0+GDwwuKZ89jQ@mail.gmail.com/T/ --- kernel/sched/fair.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6986ea31c984..5f6587d94c1d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10238,6 +10238,16 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / sds->total_capacity; + + /* + * If the local group is more loaded than the average system + * load, don't try to pull any tasks. + */ + if (local->avg_load >= sds->avg_load) { + env->imbalance = 0; + return; + } + } /* From 57dcd64c7e036299ef526b400a8d12b8a2352f26 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 5 Apr 2023 22:15:32 +0900 Subject: [PATCH 080/175] cgroup,freezer: hold cpu_hotplug_lock before freezer_mutex syzbot is reporting circular locking dependency between cpu_hotplug_lock and freezer_mutex, for commit f5d39b020809 ("freezer,sched: Rewrite core freezer logic") replaced atomic_inc() in freezer_apply_state() with static_branch_inc() which holds cpu_hotplug_lock. cpu_hotplug_lock => cgroup_threadgroup_rwsem => freezer_mutex cgroup_file_write() { cgroup_procs_write() { __cgroup_procs_write() { cgroup_procs_write_start() { cgroup_attach_lock() { cpus_read_lock() { percpu_down_read(&cpu_hotplug_lock); } percpu_down_write(&cgroup_threadgroup_rwsem); } } cgroup_attach_task() { cgroup_migrate() { cgroup_migrate_execute() { freezer_attach() { mutex_lock(&freezer_mutex); (...snipped...) } } } } (...snipped...) } } } freezer_mutex => cpu_hotplug_lock cgroup_file_write() { freezer_write() { freezer_change_state() { mutex_lock(&freezer_mutex); freezer_apply_state() { static_branch_inc(&freezer_active) { static_key_slow_inc() { cpus_read_lock(); static_key_slow_inc_cpuslocked(); cpus_read_unlock(); } } } mutex_unlock(&freezer_mutex); } } } Swap locking order by moving cpus_read_lock() in freezer_apply_state() to before mutex_lock(&freezer_mutex) in freezer_change_state(). Reported-by: syzbot Link: https://syzkaller.appspot.com/bug?extid=c39682e86c9d84152f93 Suggested-by: Hillf Danton Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") Signed-off-by: Tetsuo Handa Acked-by: Peter Zijlstra (Intel) Reviewed-by: Mukesh Ojha Signed-off-by: Tejun Heo --- kernel/cgroup/legacy_freezer.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 1b6b21851e9d..936473203a6b 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is @@ -350,7 +351,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, if (freeze) { if (!(freezer->state & CGROUP_FREEZING)) - static_branch_inc(&freezer_active); + static_branch_inc_cpuslocked(&freezer_active); freezer->state |= state; freeze_cgroup(freezer); } else { @@ -361,7 +362,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, if (!(freezer->state & CGROUP_FREEZING)) { freezer->state &= ~CGROUP_FROZEN; if (was_freezing) - static_branch_dec(&freezer_active); + static_branch_dec_cpuslocked(&freezer_active); unfreeze_cgroup(freezer); } } @@ -379,6 +380,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) { struct cgroup_subsys_state *pos; + cpus_read_lock(); /* * Update all its descendants in pre-order traversal. Each * descendant will try to inherit its parent's FREEZING state as @@ -407,6 +409,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) } rcu_read_unlock(); mutex_unlock(&freezer_mutex); + cpus_read_unlock(); } static ssize_t freezer_write(struct kernfs_open_file *of, From ba9182a89626d5f83c2ee4594f55cb9c1e60f0e2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 11 Apr 2023 09:35:57 -0400 Subject: [PATCH 081/175] cgroup/cpuset: Wake up cpuset_attach_wq tasks in cpuset_cancel_attach() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After a successful cpuset_can_attach() call which increments the attach_in_progress flag, either cpuset_cancel_attach() or cpuset_attach() will be called later. In cpuset_attach(), tasks in cpuset_attach_wq, if present, will be woken up at the end. That is not the case in cpuset_cancel_attach(). So missed wakeup is possible if the attach operation is somehow cancelled. Fix that by doing the wakeup in cpuset_cancel_attach() as well. Fixes: e44193d39e8d ("cpuset: let hotplug propagation work wait for task attaching") Signed-off-by: Waiman Long Reviewed-by: Michal Koutný Cc: stable@vger.kernel.org # v3.11+ Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f310915d1257..ff7eb8e2efdc 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2502,11 +2502,15 @@ out_unlock: static void cpuset_cancel_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; + struct cpuset *cs; cgroup_taskset_first(tset, &css); + cs = css_cs(css); percpu_down_write(&cpuset_rwsem); - css_cs(css)->attach_in_progress--; + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); percpu_up_write(&cpuset_rwsem); } From 42a11bf5c5436e91b040aeb04063be1710bb9f9c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 11 Apr 2023 09:35:58 -0400 Subject: [PATCH 082/175] cgroup/cpuset: Make cpuset_fork() handle CLONE_INTO_CGROUP properly By default, the clone(2) syscall spawn a child process into the same cgroup as its parent. With the use of the CLONE_INTO_CGROUP flag introduced by commit ef2c41cf38a7 ("clone3: allow spawning processes into cgroups"), the child will be spawned into a different cgroup which is somewhat similar to writing the child's tid into "cgroup.threads". The current cpuset_fork() method does not properly handle the CLONE_INTO_CGROUP case where the cpuset of the child may be different from that of its parent. Update the cpuset_fork() method to treat the CLONE_INTO_CGROUP case similar to cpuset_attach(). Since the newly cloned task has not been running yet, its actual memory usage isn't known. So it is not necessary to make change to mm in cpuset_fork(). Fixes: ef2c41cf38a7 ("clone3: allow spawning processes into cgroups") Reported-by: Giuseppe Scrivano Signed-off-by: Waiman Long Cc: stable@vger.kernel.org # v5.7+ Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 64 ++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 21 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ff7eb8e2efdc..2ccfae74acf9 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2515,16 +2515,33 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) } /* - * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach() + * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task() * but we can't allocate it dynamically there. Define it global and * allocate from cpuset_init(). */ static cpumask_var_t cpus_attach; +static nodemask_t cpuset_attach_nodemask_to; + +static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) +{ + percpu_rwsem_assert_held(&cpuset_rwsem); + + if (cs != &top_cpuset) + guarantee_online_cpus(task, cpus_attach); + else + cpumask_copy(cpus_attach, task_cpu_possible_mask(task)); + /* + * can_attach beforehand should guarantee that this doesn't + * fail. TODO: have a better way to handle failure here + */ + WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + + cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); + cpuset_update_task_spread_flags(cs, task); +} static void cpuset_attach(struct cgroup_taskset *tset) { - /* static buf protected by cpuset_rwsem */ - static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; struct task_struct *leader; struct cgroup_subsys_state *css; @@ -2555,20 +2572,8 @@ static void cpuset_attach(struct cgroup_taskset *tset) guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, css, tset) { - if (cs != &top_cpuset) - guarantee_online_cpus(task, cpus_attach); - else - cpumask_copy(cpus_attach, task_cpu_possible_mask(task)); - /* - * can_attach beforehand should guarantee that this doesn't - * fail. TODO: have a better way to handle failure here - */ - WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); - - cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flags(cs, task); - } + cgroup_taskset_for_each(task, css, tset) + cpuset_attach_task(cs, task); /* * Change mm for all threadgroup leaders. This is expensive and may @@ -3266,11 +3271,28 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) */ static void cpuset_fork(struct task_struct *task) { - if (task_css_is_root(task, cpuset_cgrp_id)) - return; + struct cpuset *cs; + bool same_cs; - set_cpus_allowed_ptr(task, current->cpus_ptr); - task->mems_allowed = current->mems_allowed; + rcu_read_lock(); + cs = task_cs(task); + same_cs = (cs == task_cs(current)); + rcu_read_unlock(); + + if (same_cs) { + if (cs == &top_cpuset) + return; + + set_cpus_allowed_ptr(task, current->cpus_ptr); + task->mems_allowed = current->mems_allowed; + return; + } + + /* CLONE_INTO_CGROUP */ + percpu_down_write(&cpuset_rwsem); + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + cpuset_attach_task(cs, task); + percpu_up_write(&cpuset_rwsem); } struct cgroup_subsys cpuset_cgrp_subsys = { From eee87853794187f6adbe19533ed79c8b44b36a91 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 11 Apr 2023 09:35:59 -0400 Subject: [PATCH 083/175] cgroup/cpuset: Add cpuset_can_fork() and cpuset_cancel_fork() methods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the case of CLONE_INTO_CGROUP, not all cpusets are ready to accept new tasks. It is too late to check that in cpuset_fork(). So we need to add the cpuset_can_fork() and cpuset_cancel_fork() methods to pre-check it before we can allow attachment to a different cpuset. We also need to set the attach_in_progress flag to alert other code that a new task is going to be added to the cpuset. Fixes: ef2c41cf38a7 ("clone3: allow spawning processes into cgroups") Suggested-by: Michal Koutný Signed-off-by: Waiman Long Cc: stable@vger.kernel.org # v5.7+ Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 97 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 86 insertions(+), 11 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2ccfae74acf9..166a45019f66 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2453,6 +2453,20 @@ static int fmeter_getrate(struct fmeter *fmp) static struct cpuset *cpuset_attach_old_cs; +/* + * Check to see if a cpuset can accept a new task + * For v1, cpus_allowed and mems_allowed can't be empty. + * For v2, effective_cpus can't be empty. + * Note that in v1, effective_cpus = cpus_allowed. + */ +static int cpuset_can_attach_check(struct cpuset *cs) +{ + if (cpumask_empty(cs->effective_cpus) || + (!is_in_v2_mode() && nodes_empty(cs->mems_allowed))) + return -ENOSPC; + return 0; +} + /* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */ static int cpuset_can_attach(struct cgroup_taskset *tset) { @@ -2467,16 +2481,9 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) percpu_down_write(&cpuset_rwsem); - /* allow moving tasks into an empty cpuset if on default hierarchy */ - ret = -ENOSPC; - if (!is_in_v2_mode() && - (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) - goto out_unlock; - - /* - * Task cannot be moved to a cpuset with empty effective cpus. - */ - if (cpumask_empty(cs->effective_cpus)) + /* Check to see if task is allowed in the cpuset */ + ret = cpuset_can_attach_check(cs); + if (ret) goto out_unlock; cgroup_taskset_for_each(task, css, tset) { @@ -2493,7 +2500,6 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; - ret = 0; out_unlock: percpu_up_write(&cpuset_rwsem); return ret; @@ -3264,6 +3270,68 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) percpu_up_write(&cpuset_rwsem); } +/* + * In case the child is cloned into a cpuset different from its parent, + * additional checks are done to see if the move is allowed. + */ +static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) +{ + struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); + bool same_cs; + int ret; + + rcu_read_lock(); + same_cs = (cs == task_cs(current)); + rcu_read_unlock(); + + if (same_cs) + return 0; + + lockdep_assert_held(&cgroup_mutex); + percpu_down_write(&cpuset_rwsem); + + /* Check to see if task is allowed in the cpuset */ + ret = cpuset_can_attach_check(cs); + if (ret) + goto out_unlock; + + ret = task_can_attach(task, cs->effective_cpus); + if (ret) + goto out_unlock; + + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; + + /* + * Mark attach is in progress. This makes validate_change() fail + * changes which zero cpus/mems_allowed. + */ + cs->attach_in_progress++; +out_unlock: + percpu_up_write(&cpuset_rwsem); + return ret; +} + +static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) +{ + struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]); + bool same_cs; + + rcu_read_lock(); + same_cs = (cs == task_cs(current)); + rcu_read_unlock(); + + if (same_cs) + return; + + percpu_down_write(&cpuset_rwsem); + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); + percpu_up_write(&cpuset_rwsem); +} + /* * Make sure the new task conform to the current state of its parent, * which could have been changed by cpuset just after it inherits the @@ -3292,6 +3360,11 @@ static void cpuset_fork(struct task_struct *task) percpu_down_write(&cpuset_rwsem); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cpuset_attach_task(cs, task); + + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); + percpu_up_write(&cpuset_rwsem); } @@ -3305,6 +3378,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .attach = cpuset_attach, .post_attach = cpuset_post_attach, .bind = cpuset_bind, + .can_fork = cpuset_can_fork, + .cancel_fork = cpuset_cancel_fork, .fork = cpuset_fork, .legacy_cftypes = legacy_files, .dfl_cftypes = dfl_files, From 7e27cb6ad4d85fc8bac2a2a896da62ef66b8598e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 11 Apr 2023 09:36:00 -0400 Subject: [PATCH 084/175] cgroup/cpuset: Make cpuset_attach_task() skip subpartitions CPUs for top_cpuset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is found that attaching a task to the top_cpuset does not currently ignore CPUs allocated to subpartitions in cpuset_attach_task(). So the code is changed to fix that. Signed-off-by: Waiman Long Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 166a45019f66..505d86b16642 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2535,7 +2535,8 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) if (cs != &top_cpuset) guarantee_online_cpus(task, cpus_attach); else - cpumask_copy(cpus_attach, task_cpu_possible_mask(task)); + cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), + cs->subparts_cpus); /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here From 8eda19cd59cedbfe4ec11aea4bcecabe4c98e9e4 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Wed, 12 Apr 2023 17:05:31 +0100 Subject: [PATCH 085/175] ALSA: hda/realtek: Add quirks for Lenovo Z13/Z16 Gen2 These Lenovo laptops use Realtek HDA codec combined with 2xCS35L41 Amplifiers using I2C with External Boost. Signed-off-by: Stefan Binding Cc: Link: https://lore.kernel.org/r/20230412160531.182007-1-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index f554db1e7e9a..3b9f077a227f 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9689,6 +9689,9 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x22f1, "Thinkpad", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x22f2, "Thinkpad", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x22f3, "Thinkpad", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x2318, "Thinkpad Z13 Gen2", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x2319, "Thinkpad Z16 Gen2", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x231a, "Thinkpad Z16 Gen2", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x30bb, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY), SND_PCI_QUIRK(0x17aa, 0x30e2, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY), SND_PCI_QUIRK(0x17aa, 0x310c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION), From 775d3c514c5b2763a50ab7839026d7561795924d Mon Sep 17 00:00:00 2001 From: Matija Glavinic Pecotic Date: Thu, 6 Apr 2023 08:26:52 +0200 Subject: [PATCH 086/175] x86/rtc: Remove __init for runtime functions set_rtc_noop(), get_rtc_noop() are after booting, therefore their __init annotation is wrong. A crash was observed on an x86 platform where CMOS RTC is unused and disabled via device tree. set_rtc_noop() was invoked from ntp: sync_hw_clock(), although CONFIG_RTC_SYSTOHC=n, however sync_cmos_clock() doesn't honour that. Workqueue: events_power_efficient sync_hw_clock RIP: 0010:set_rtc_noop Call Trace: update_persistent_clock64 sync_hw_clock Fix this by dropping the __init annotation from set/get_rtc_noop(). Fixes: c311ed6183f4 ("x86/init: Allow DT configured systems to disable RTC at boot time") Signed-off-by: Matija Glavinic Pecotic Signed-off-by: Thomas Gleixner Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/59f7ceb1-446b-1d3d-0bc8-1f0ee94b1e18@nokia.com --- arch/x86/kernel/x86_init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ef80d361b463..10622cf2b30f 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -33,8 +33,8 @@ static int __init iommu_init_noop(void) { return 0; } static void iommu_shutdown_noop(void) { } bool __init bool_x86_init_noop(void) { return false; } void x86_op_int_noop(int cpu) { } -static __init int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } -static __init void get_rtc_noop(struct timespec64 *now) { } +static int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } +static void get_rtc_noop(struct timespec64 *now) { } static __initconst const struct of_device_id of_cmos_match[] = { { .compatible = "motorola,mc146818" }, From c8bc34660628769f71e8b230144a2a4d86ab0f91 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 13 Apr 2023 14:51:56 +0100 Subject: [PATCH 087/175] sunrpc: Fix RFC6803 encryption test The usage_data[] array in rfc6803_encrypt_case() is uninitialised, so clear it as it may cause the tests to fail otherwise. Fixes: b958cff6b27b ("SUNRPC: Add encryption KUnit tests for the RFC 6803 encryption types") Link: https://lore.kernel.org/r/380323.1681314997@warthog.procyon.org.uk/ Signed-off-by: David Howells cc: Chuck Lever cc: Scott Mayhew cc: Herbert Xu cc: linux-nfs@vger.kernel.org cc: linux-crypto@vger.kernel.org Signed-off-by: Chuck Lever --- net/sunrpc/auth_gss/gss_krb5_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/auth_gss/gss_krb5_test.c b/net/sunrpc/auth_gss/gss_krb5_test.c index ce0541e32fc9..aa6ec4e858aa 100644 --- a/net/sunrpc/auth_gss/gss_krb5_test.c +++ b/net/sunrpc/auth_gss/gss_krb5_test.c @@ -1327,6 +1327,7 @@ static void rfc6803_encrypt_case(struct kunit *test) if (!gk5e) kunit_skip(test, "Encryption type is not available"); + memset(usage_data, 0, sizeof(usage_data)); usage.data[3] = param->constant; Ke.len = gk5e->Ke_length; From aca3b0fa3d04b40c96934d86cc224cccfa7ea8e0 Mon Sep 17 00:00:00 2001 From: Saravanan Vajravel Date: Fri, 31 Mar 2023 23:34:24 -0700 Subject: [PATCH 088/175] RDMA/core: Fix GID entry ref leak when create_ah fails If AH create request fails, release sgid_attr to avoid GID entry referrence leak reported while releasing GID table Fixes: 1a1f460ff151 ("RDMA: Hold the sgid_attr inside the struct ib_ah/qp") Link: https://lore.kernel.org/r/20230401063424.342204-1-saravanan.vajravel@broadcom.com Reviewed-by: Selvin Xavier Signed-off-by: Saravanan Vajravel Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 11b1c1603aeb..b99b3cc283b6 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -532,6 +532,8 @@ static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, else ret = device->ops.create_ah(ah, &init_attr, NULL); if (ret) { + if (ah->sgid_attr) + rdma_put_gid_attr(ah->sgid_attr); kfree(ah); return ERR_PTR(ret); } From f8160d3b35fc94491bb0cb974dbda310ef96c0e2 Mon Sep 17 00:00:00 2001 From: Gregor Herburger Date: Thu, 13 Apr 2023 11:37:37 +0200 Subject: [PATCH 089/175] i2c: ocores: generate stop condition after timeout in polling mode In polling mode, no stop condition is generated after a timeout. This causes SCL to remain low and thereby block the bus. If this happens during a transfer it can cause slaves to misinterpret the subsequent transfer and return wrong values. To solve this, pass the ETIMEDOUT error up from ocores_process_polling() instead of setting STATE_ERROR directly. The caller is adjusted to call ocores_process_timeout() on error both in polling and in IRQ mode, which will set STATE_ERROR and generate a stop condition. Fixes: 69c8c0c0efa8 ("i2c: ocores: add polling interface") Signed-off-by: Gregor Herburger Signed-off-by: Matthias Schiffer Acked-by: Peter Korsgaard Reviewed-by: Andrew Lunn Reviewed-by: Federico Vaga Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-ocores.c | 35 ++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c index a0af027db04c..2e575856c5cd 100644 --- a/drivers/i2c/busses/i2c-ocores.c +++ b/drivers/i2c/busses/i2c-ocores.c @@ -342,18 +342,18 @@ static int ocores_poll_wait(struct ocores_i2c *i2c) * ocores_isr(), we just add our polling code around it. * * It can run in atomic context + * + * Return: 0 on success, -ETIMEDOUT on timeout */ -static void ocores_process_polling(struct ocores_i2c *i2c) +static int ocores_process_polling(struct ocores_i2c *i2c) { - while (1) { - irqreturn_t ret; - int err; + irqreturn_t ret; + int err = 0; + while (1) { err = ocores_poll_wait(i2c); - if (err) { - i2c->state = STATE_ERROR; + if (err) break; /* timeout */ - } ret = ocores_isr(-1, i2c); if (ret == IRQ_NONE) @@ -364,13 +364,15 @@ static void ocores_process_polling(struct ocores_i2c *i2c) break; } } + + return err; } static int ocores_xfer_core(struct ocores_i2c *i2c, struct i2c_msg *msgs, int num, bool polling) { - int ret; + int ret = 0; u8 ctrl; ctrl = oc_getreg(i2c, OCI2C_CONTROL); @@ -388,15 +390,16 @@ static int ocores_xfer_core(struct ocores_i2c *i2c, oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_START); if (polling) { - ocores_process_polling(i2c); + ret = ocores_process_polling(i2c); } else { - ret = wait_event_timeout(i2c->wait, - (i2c->state == STATE_ERROR) || - (i2c->state == STATE_DONE), HZ); - if (ret == 0) { - ocores_process_timeout(i2c); - return -ETIMEDOUT; - } + if (wait_event_timeout(i2c->wait, + (i2c->state == STATE_ERROR) || + (i2c->state == STATE_DONE), HZ) == 0) + ret = -ETIMEDOUT; + } + if (ret) { + ocores_process_timeout(i2c); + return ret; } return (i2c->state == STATE_DONE) ? num : -EIO; From e7067a446264a7514fa1cfaa4052cdb6803bc6a2 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Thu, 13 Apr 2023 23:49:57 +0900 Subject: [PATCH 090/175] ksmbd: avoid out of bounds access in decode_preauth_ctxt() Confirm that the accessed pneg_ctxt->HashAlgorithms address sits within the SMB request boundary; deassemble_neg_contexts() only checks that the eight byte smb2_neg_context header + (client controlled) DataLength are within the packet boundary, which is insufficient. Checking for sizeof(struct smb2_preauth_neg_context) is overkill given that the type currently assumes SMB311_SALT_SIZE bytes of trailing Salt. Signed-off-by: David Disseldorp Acked-by: Namjae Jeon Cc: Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 8af939a181be..67b7e766a06b 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -876,17 +876,21 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn, } static __le32 decode_preauth_ctxt(struct ksmbd_conn *conn, - struct smb2_preauth_neg_context *pneg_ctxt) + struct smb2_preauth_neg_context *pneg_ctxt, + int len_of_ctxts) { - __le32 err = STATUS_NO_PREAUTH_INTEGRITY_HASH_OVERLAP; + /* + * sizeof(smb2_preauth_neg_context) assumes SMB311_SALT_SIZE Salt, + * which may not be present. Only check for used HashAlgorithms[1]. + */ + if (len_of_ctxts < MIN_PREAUTH_CTXT_DATA_LEN) + return STATUS_INVALID_PARAMETER; - if (pneg_ctxt->HashAlgorithms == SMB2_PREAUTH_INTEGRITY_SHA512) { - conn->preauth_info->Preauth_HashId = - SMB2_PREAUTH_INTEGRITY_SHA512; - err = STATUS_SUCCESS; - } + if (pneg_ctxt->HashAlgorithms != SMB2_PREAUTH_INTEGRITY_SHA512) + return STATUS_NO_PREAUTH_INTEGRITY_HASH_OVERLAP; - return err; + conn->preauth_info->Preauth_HashId = SMB2_PREAUTH_INTEGRITY_SHA512; + return STATUS_SUCCESS; } static void decode_encrypt_ctxt(struct ksmbd_conn *conn, @@ -1014,7 +1018,8 @@ static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn, break; status = decode_preauth_ctxt(conn, - (struct smb2_preauth_neg_context *)pctx); + (struct smb2_preauth_neg_context *)pctx, + len_of_ctxts); if (status != STATUS_SUCCESS) break; } else if (pctx->ContextType == SMB2_ENCRYPTION_CAPABILITIES) { From ef69d2559fe91f23d27a3d6fd640b5641787d22e Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 29 Mar 2023 10:19:30 +0200 Subject: [PATCH 091/175] riscv: Move early dtb mapping into the fixmap region riscv establishes 2 virtual mappings: - early_pg_dir maps the kernel which allows to discover the system memory - swapper_pg_dir installs the final mapping (linear mapping included) We used to map the dtb in early_pg_dir using DTB_EARLY_BASE_VA, and this mapping was not carried over in swapper_pg_dir. It happens that early_init_fdt_scan_reserved_mem() must be called before swapper_pg_dir is setup otherwise we could allocate reserved memory defined in the dtb. And this function initializes reserved_mem variable with addresses that lie in the early_pg_dir dtb mapping: when those addresses are reused with swapper_pg_dir, this mapping does not exist and then we trap. The previous "fix" was incorrect as early_init_fdt_scan_reserved_mem() must be called before swapper_pg_dir is set up otherwise we could allocate in reserved memory defined in the dtb. So move the dtb mapping in the fixmap region which is established in early_pg_dir and handed over to swapper_pg_dir. Fixes: 922b0375fc93 ("riscv: Fix memblock reservation for device tree blob") Fixes: 8f3a2b4a96dc ("RISC-V: Move DT mapping outof fixmap") Fixes: 50e63dd8ed92 ("riscv: fix reserved memory setup") Reported-by: Conor Dooley Link: https://lore.kernel.org/all/f8e67f82-103d-156c-deb0-d6d6e2756f5e@microchip.com/ Signed-off-by: Alexandre Ghiti Reviewed-by: Conor Dooley Tested-by: Conor Dooley Link: https://lore.kernel.org/r/20230329081932.79831-2-alexghiti@rivosinc.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- Documentation/riscv/vm-layout.rst | 6 +-- arch/riscv/include/asm/fixmap.h | 8 ++++ arch/riscv/include/asm/pgtable.h | 8 +++- arch/riscv/kernel/setup.c | 1 - arch/riscv/mm/init.c | 61 +++++++++++++++++-------------- 5 files changed, 51 insertions(+), 33 deletions(-) diff --git a/Documentation/riscv/vm-layout.rst b/Documentation/riscv/vm-layout.rst index 3be44e74ec5d..5462c84f4723 100644 --- a/Documentation/riscv/vm-layout.rst +++ b/Documentation/riscv/vm-layout.rst @@ -47,7 +47,7 @@ RISC-V Linux Kernel SV39 | Kernel-space virtual memory, shared between all processes: ____________________________________________________________|___________________________________________________________ | | | | - ffffffc6fee00000 | -228 GB | ffffffc6feffffff | 2 MB | fixmap + ffffffc6fea00000 | -228 GB | ffffffc6feffffff | 6 MB | fixmap ffffffc6ff000000 | -228 GB | ffffffc6ffffffff | 16 MB | PCI io ffffffc700000000 | -228 GB | ffffffc7ffffffff | 4 GB | vmemmap ffffffc800000000 | -224 GB | ffffffd7ffffffff | 64 GB | vmalloc/ioremap space @@ -83,7 +83,7 @@ RISC-V Linux Kernel SV48 | Kernel-space virtual memory, shared between all processes: ____________________________________________________________|___________________________________________________________ | | | | - ffff8d7ffee00000 | -114.5 TB | ffff8d7ffeffffff | 2 MB | fixmap + ffff8d7ffea00000 | -114.5 TB | ffff8d7ffeffffff | 6 MB | fixmap ffff8d7fff000000 | -114.5 TB | ffff8d7fffffffff | 16 MB | PCI io ffff8d8000000000 | -114.5 TB | ffff8f7fffffffff | 2 TB | vmemmap ffff8f8000000000 | -112.5 TB | ffffaf7fffffffff | 32 TB | vmalloc/ioremap space @@ -119,7 +119,7 @@ RISC-V Linux Kernel SV57 | Kernel-space virtual memory, shared between all processes: ____________________________________________________________|___________________________________________________________ | | | | - ff1bfffffee00000 | -57 PB | ff1bfffffeffffff | 2 MB | fixmap + ff1bfffffea00000 | -57 PB | ff1bfffffeffffff | 6 MB | fixmap ff1bffffff000000 | -57 PB | ff1bffffffffffff | 16 MB | PCI io ff1c000000000000 | -57 PB | ff1fffffffffffff | 1 PB | vmemmap ff20000000000000 | -56 PB | ff5fffffffffffff | 16 PB | vmalloc/ioremap space diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h index 5c3e7b97fcc6..0a55099bb734 100644 --- a/arch/riscv/include/asm/fixmap.h +++ b/arch/riscv/include/asm/fixmap.h @@ -22,6 +22,14 @@ */ enum fixed_addresses { FIX_HOLE, + /* + * The fdt fixmap mapping must be PMD aligned and will be mapped + * using PMD entries in fixmap_pmd in 64-bit and a PGD entry in 32-bit. + */ + FIX_FDT_END, + FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1, + + /* Below fixmaps will be mapped using fixmap_pte */ FIX_PTE, FIX_PMD, FIX_PUD, diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index ab05f892d317..f641837ccf31 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -87,9 +87,13 @@ #define FIXADDR_TOP PCI_IO_START #ifdef CONFIG_64BIT -#define FIXADDR_SIZE PMD_SIZE +#define MAX_FDT_SIZE PMD_SIZE +#define FIX_FDT_SIZE (MAX_FDT_SIZE + SZ_2M) +#define FIXADDR_SIZE (PMD_SIZE + FIX_FDT_SIZE) #else -#define FIXADDR_SIZE PGDIR_SIZE +#define MAX_FDT_SIZE PGDIR_SIZE +#define FIX_FDT_SIZE MAX_FDT_SIZE +#define FIXADDR_SIZE (PGDIR_SIZE + FIX_FDT_SIZE) #endif #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 376d2827e736..542eed85ad2c 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -283,7 +283,6 @@ void __init setup_arch(char **cmdline_p) else pr_err("No DTB found in kernel mappings\n"); #endif - early_init_fdt_scan_reserved_mem(); misc_mem_init(); init_resources(); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 478d6763a01a..fb78d6bbabae 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -57,7 +57,6 @@ unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] EXPORT_SYMBOL(empty_zero_page); extern char _start[]; -#define DTB_EARLY_BASE_VA PGDIR_SIZE void *_dtb_early_va __initdata; uintptr_t _dtb_early_pa __initdata; @@ -236,6 +235,14 @@ static void __init setup_bootmem(void) set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET); reserve_initrd_mem(); + + /* + * No allocation should be done before reserving the memory as defined + * in the device tree, otherwise the allocation could end up in a + * reserved region. + */ + early_init_fdt_scan_reserved_mem(); + /* * If DTB is built in, no need to reserve its memblock. * Otherwise, do reserve it but avoid using @@ -279,9 +286,6 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); -static p4d_t __maybe_unused early_dtb_p4d[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); -static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); -static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #ifdef CONFIG_XIP_KERNEL #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops)) @@ -626,9 +630,6 @@ static void __init create_p4d_mapping(p4d_t *p4dp, #define trampoline_pgd_next (pgtable_l5_enabled ? \ (uintptr_t)trampoline_p4d : (pgtable_l4_enabled ? \ (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd)) -#define early_dtb_pgd_next (pgtable_l5_enabled ? \ - (uintptr_t)early_dtb_p4d : (pgtable_l4_enabled ? \ - (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd)) #else #define pgd_next_t pte_t #define alloc_pgd_next(__va) pt_ops.alloc_pte(__va) @@ -636,7 +637,6 @@ static void __init create_p4d_mapping(p4d_t *p4dp, #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ create_pte_mapping(__nextp, __va, __pa, __sz, __prot) #define fixmap_pgd_next ((uintptr_t)fixmap_pte) -#define early_dtb_pgd_next ((uintptr_t)early_dtb_pmd) #define create_p4d_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) #define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) @@ -860,32 +860,28 @@ static void __init create_kernel_page_table(pgd_t *pgdir, bool early) * this means 2 PMD entries whereas for 32-bit kernel, this is only 1 PGDIR * entry. */ -static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) +static void __init create_fdt_early_page_table(pgd_t *pgdir, + uintptr_t fix_fdt_va, + uintptr_t dtb_pa) { -#ifndef CONFIG_BUILTIN_DTB uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1); - create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, - IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa, - PGDIR_SIZE, - IS_ENABLED(CONFIG_64BIT) ? PAGE_TABLE : PAGE_KERNEL); +#ifndef CONFIG_BUILTIN_DTB + /* Make sure the fdt fixmap address is always aligned on PMD size */ + BUILD_BUG_ON(FIX_FDT % (PMD_SIZE / PAGE_SIZE)); - if (pgtable_l5_enabled) - create_p4d_mapping(early_dtb_p4d, DTB_EARLY_BASE_VA, - (uintptr_t)early_dtb_pud, P4D_SIZE, PAGE_TABLE); - - if (pgtable_l4_enabled) - create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA, - (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE); - - if (IS_ENABLED(CONFIG_64BIT)) { - create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, + /* In 32-bit only, the fdt lies in its own PGD */ + if (!IS_ENABLED(CONFIG_64BIT)) { + create_pgd_mapping(early_pg_dir, fix_fdt_va, + pa, MAX_FDT_SIZE, PAGE_KERNEL); + } else { + create_pmd_mapping(fixmap_pmd, fix_fdt_va, pa, PMD_SIZE, PAGE_KERNEL); - create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA + PMD_SIZE, + create_pmd_mapping(fixmap_pmd, fix_fdt_va + PMD_SIZE, pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL); } - dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1)); + dtb_early_va = (void *)fix_fdt_va + (dtb_pa & (PMD_SIZE - 1)); #else /* * For 64-bit kernel, __va can't be used since it would return a linear @@ -1055,7 +1051,8 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) create_kernel_page_table(early_pg_dir, true); /* Setup early mapping for FDT early scan */ - create_fdt_early_page_table(early_pg_dir, dtb_pa); + create_fdt_early_page_table(early_pg_dir, + __fix_to_virt(FIX_FDT), dtb_pa); /* * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap @@ -1097,6 +1094,16 @@ static void __init setup_vm_final(void) u64 i; /* Setup swapper PGD for fixmap */ +#if !defined(CONFIG_64BIT) + /* + * In 32-bit, the device tree lies in a pgd entry, so it must be copied + * directly in swapper_pg_dir in addition to the pgd entry that points + * to fixmap_pte. + */ + unsigned long idx = pgd_index(__fix_to_virt(FIX_FDT)); + + set_pgd(&swapper_pg_dir[idx], early_pg_dir[idx]); +#endif create_pgd_mapping(swapper_pg_dir, FIXADDR_START, __pa_symbol(fixmap_pgd_next), PGDIR_SIZE, PAGE_TABLE); From f1581626071c8e37c58c5e8f0b4126b17172a211 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 29 Mar 2023 10:19:31 +0200 Subject: [PATCH 092/175] riscv: Do not set initial_boot_params to the linear address of the dtb early_init_dt_verify() is already called in parse_dtb() and since the dtb address does not change anymore (it is now in the fixmap region), no need to reset initial_boot_params by calling early_init_dt_verify() again. Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230329081932.79831-3-alexghiti@rivosinc.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/setup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 542eed85ad2c..a059b73f4ddb 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -278,10 +278,7 @@ void __init setup_arch(char **cmdline_p) #if IS_ENABLED(CONFIG_BUILTIN_DTB) unflatten_and_copy_device_tree(); #else - if (early_init_dt_verify(__va(XIP_FIXUP(dtb_early_pa)))) - unflatten_device_tree(); - else - pr_err("No DTB found in kernel mappings\n"); + unflatten_device_tree(); #endif misc_mem_init(); From 1b50f956c8fe9082bdee4a9cfd798149c52f7043 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 29 Mar 2023 10:19:32 +0200 Subject: [PATCH 093/175] riscv: No need to relocate the dtb as it lies in the fixmap region We used to access the dtb via its linear mapping address but now that the dtb early mapping was moved in the fixmap region, we can keep using this address since it is present in swapper_pg_dir, and remove the dtb relocation. Note that the relocation was wrong anyway since early_memremap() is restricted to 256K whereas the maximum fdt size is 2MB. Signed-off-by: Alexandre Ghiti Reviewed-by: Conor Dooley Tested-by: Conor Dooley Link: https://lore.kernel.org/r/20230329081932.79831-4-alexghiti@rivosinc.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index fb78d6bbabae..0f14f4a8d179 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -249,25 +249,8 @@ static void __init setup_bootmem(void) * early_init_fdt_reserve_self() since __pa() does * not work for DTB pointers that are fixmap addresses */ - if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) { - /* - * In case the DTB is not located in a memory region we won't - * be able to locate it later on via the linear mapping and - * get a segfault when accessing it via __va(dtb_early_pa). - * To avoid this situation copy DTB to a memory region. - * Note that memblock_phys_alloc will also reserve DTB region. - */ - if (!memblock_is_memory(dtb_early_pa)) { - size_t fdt_size = fdt_totalsize(dtb_early_va); - phys_addr_t new_dtb_early_pa = memblock_phys_alloc(fdt_size, PAGE_SIZE); - void *new_dtb_early_va = early_memremap(new_dtb_early_pa, fdt_size); - - memcpy(new_dtb_early_va, dtb_early_va, fdt_size); - early_memunmap(new_dtb_early_va, fdt_size); - _dtb_early_pa = new_dtb_early_pa; - } else - memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); - } + if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) + memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); dma_contiguous_reserve(dma32_phys_limit); if (IS_ENABLED(CONFIG_64BIT)) From 74391b3e69855e7dd65a9cef36baf5fc1345affd Mon Sep 17 00:00:00 2001 From: Duy Truong Date: Thu, 13 Apr 2023 17:55:48 -0700 Subject: [PATCH 094/175] nvme-pci: add NVME_QUIRK_BOGUS_NID for T-FORCE Z330 SSD Added a quirk to fix the TeamGroup T-Force Cardea Zero Z330 SSDs reporting duplicate NGUIDs. Signed-off-by: Duy Truong Cc: stable@vger.kernel.org Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 282d808400c5..cd7873de3121 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3443,6 +3443,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(0x1d97, 0x2269), /* Lexar NM760 */ .driver_data = NVME_QUIRK_BOGUS_NID | NVME_QUIRK_IGNORE_DEV_SUBNQN, }, + { PCI_DEVICE(0x10ec, 0x5763), /* TEAMGROUP T-FORCE CARDEA ZERO Z330 SSD */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061), .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065), From 6ab6f98fcdc9d4fbe245aa67de03542deea65322 Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Thu, 13 Apr 2023 22:11:53 +0300 Subject: [PATCH 095/175] ALSA: hda/hdmi: disable KAE for Intel DG2 Use of keep-alive (KAE) has resulted in loss of audio on some A750/770 cards as the transition from keep-alive to stream playback is not working as expected. As there is limited benefit of the new KAE mode on discrete cards, revert back to older silent-stream implementation on these systems. Cc: stable@vger.kernel.org Fixes: 15175a4f2bbb ("ALSA: hda/hdmi: add keep-alive support for ADL-P and DG2") Link: https://gitlab.freedesktop.org/drm/intel/-/issues/8307 Signed-off-by: Kai Vehmanen Link: https://lore.kernel.org/r/20230413191153.3692049-1-kai.vehmanen@linux.intel.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_hdmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 4ffa3a59f419..5c6980394dce 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -4604,7 +4604,7 @@ HDA_CODEC_ENTRY(0x80862814, "DG1 HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x80862815, "Alderlake HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x80862816, "Rocketlake HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x80862818, "Raptorlake HDMI", patch_i915_tgl_hdmi), -HDA_CODEC_ENTRY(0x80862819, "DG2 HDMI", patch_i915_adlp_hdmi), +HDA_CODEC_ENTRY(0x80862819, "DG2 HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x8086281a, "Jasperlake HDMI", patch_i915_icl_hdmi), HDA_CODEC_ENTRY(0x8086281b, "Elkhartlake HDMI", patch_i915_icl_hdmi), HDA_CODEC_ENTRY(0x8086281c, "Alderlake-P HDMI", patch_i915_adlp_hdmi), From 3037933448f60f9acb705997eae62013ecb81e0d Mon Sep 17 00:00:00 2001 From: Gwangun Jung Date: Thu, 13 Apr 2023 19:35:54 +0900 Subject: [PATCH 096/175] net: sched: sch_qfq: prevent slab-out-of-bounds in qfq_activate_agg If the TCA_QFQ_LMAX value is not offered through nlattr, lmax is determined by the MTU value of the network device. The MTU of the loopback device can be set up to 2^31-1. As a result, it is possible to have an lmax value that exceeds QFQ_MIN_LMAX. Due to the invalid lmax value, an index is generated that exceeds the QFQ_MAX_INDEX(=24) value, causing out-of-bounds read/write errors. The following reports a oob access: [ 84.582666] BUG: KASAN: slab-out-of-bounds in qfq_activate_agg.constprop.0 (net/sched/sch_qfq.c:1027 net/sched/sch_qfq.c:1060 net/sched/sch_qfq.c:1313) [ 84.583267] Read of size 4 at addr ffff88810f676948 by task ping/301 [ 84.583686] [ 84.583797] CPU: 3 PID: 301 Comm: ping Not tainted 6.3.0-rc5 #1 [ 84.584164] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [ 84.584644] Call Trace: [ 84.584787] [ 84.584906] dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1)) [ 84.585108] print_report (mm/kasan/report.c:320 mm/kasan/report.c:430) [ 84.585570] kasan_report (mm/kasan/report.c:538) [ 84.585988] qfq_activate_agg.constprop.0 (net/sched/sch_qfq.c:1027 net/sched/sch_qfq.c:1060 net/sched/sch_qfq.c:1313) [ 84.586599] qfq_enqueue (net/sched/sch_qfq.c:1255) [ 84.587607] dev_qdisc_enqueue (net/core/dev.c:3776) [ 84.587749] __dev_queue_xmit (./include/net/sch_generic.h:186 net/core/dev.c:3865 net/core/dev.c:4212) [ 84.588763] ip_finish_output2 (./include/net/neighbour.h:546 net/ipv4/ip_output.c:228) [ 84.589460] ip_output (net/ipv4/ip_output.c:430) [ 84.590132] ip_push_pending_frames (./include/net/dst.h:444 net/ipv4/ip_output.c:126 net/ipv4/ip_output.c:1586 net/ipv4/ip_output.c:1606) [ 84.590285] raw_sendmsg (net/ipv4/raw.c:649) [ 84.591960] sock_sendmsg (net/socket.c:724 net/socket.c:747) [ 84.592084] __sys_sendto (net/socket.c:2142) [ 84.593306] __x64_sys_sendto (net/socket.c:2150) [ 84.593779] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) [ 84.593902] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) [ 84.594070] RIP: 0033:0x7fe568032066 [ 84.594192] Code: 0e 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 41 89 ca 64 8b 04 25 18 00 00 00 85 c09[ 84.594796] RSP: 002b:00007ffce388b4e8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c Code starting with the faulting instruction =========================================== [ 84.595047] RAX: ffffffffffffffda RBX: 00007ffce388cc70 RCX: 00007fe568032066 [ 84.595281] RDX: 0000000000000040 RSI: 00005605fdad6d10 RDI: 0000000000000003 [ 84.595515] RBP: 00005605fdad6d10 R08: 00007ffce388eeec R09: 0000000000000010 [ 84.595749] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000040 [ 84.595984] R13: 00007ffce388cc30 R14: 00007ffce388b4f0 R15: 0000001d00000001 [ 84.596218] [ 84.596295] [ 84.596351] Allocated by task 291: [ 84.596467] kasan_save_stack (mm/kasan/common.c:46) [ 84.596597] kasan_set_track (mm/kasan/common.c:52) [ 84.596725] __kasan_kmalloc (mm/kasan/common.c:384) [ 84.596852] __kmalloc_node (./include/linux/kasan.h:196 mm/slab_common.c:967 mm/slab_common.c:974) [ 84.596979] qdisc_alloc (./include/linux/slab.h:610 ./include/linux/slab.h:731 net/sched/sch_generic.c:938) [ 84.597100] qdisc_create (net/sched/sch_api.c:1244) [ 84.597222] tc_modify_qdisc (net/sched/sch_api.c:1680) [ 84.597357] rtnetlink_rcv_msg (net/core/rtnetlink.c:6174) [ 84.597495] netlink_rcv_skb (net/netlink/af_netlink.c:2574) [ 84.597627] netlink_unicast (net/netlink/af_netlink.c:1340 net/netlink/af_netlink.c:1365) [ 84.597759] netlink_sendmsg (net/netlink/af_netlink.c:1942) [ 84.597891] sock_sendmsg (net/socket.c:724 net/socket.c:747) [ 84.598016] ____sys_sendmsg (net/socket.c:2501) [ 84.598147] ___sys_sendmsg (net/socket.c:2557) [ 84.598275] __sys_sendmsg (./include/linux/file.h:31 net/socket.c:2586) [ 84.598399] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) [ 84.598520] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) [ 84.598688] [ 84.598744] The buggy address belongs to the object at ffff88810f674000 [ 84.598744] which belongs to the cache kmalloc-8k of size 8192 [ 84.599135] The buggy address is located 2664 bytes to the right of [ 84.599135] allocated 7904-byte region [ffff88810f674000, ffff88810f675ee0) [ 84.599544] [ 84.599598] The buggy address belongs to the physical page: [ 84.599777] page:00000000e638567f refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x10f670 [ 84.600074] head:00000000e638567f order:3 entire_mapcount:0 nr_pages_mapped:0 pincount:0 [ 84.600330] flags: 0x200000000010200(slab|head|node=0|zone=2) [ 84.600517] raw: 0200000000010200 ffff888100043180 dead000000000122 0000000000000000 [ 84.600764] raw: 0000000000000000 0000000080020002 00000001ffffffff 0000000000000000 [ 84.601009] page dumped because: kasan: bad access detected [ 84.601187] [ 84.601241] Memory state around the buggy address: [ 84.601396] ffff88810f676800: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 84.601620] ffff88810f676880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 84.601845] >ffff88810f676900: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 84.602069] ^ [ 84.602243] ffff88810f676980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 84.602468] ffff88810f676a00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 84.602693] ================================================================== [ 84.602924] Disabling lock debugging due to kernel taint Fixes: 3015f3d2a3cd ("pkt_sched: enable QFQ to support TSO/GSO") Reported-by: Gwangun Jung Signed-off-by: Gwangun Jung Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_qfq.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index cf5ebe43b3b4..02098a02943e 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -421,15 +421,16 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } else weight = 1; - if (tb[TCA_QFQ_LMAX]) { + if (tb[TCA_QFQ_LMAX]) lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); - if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { - pr_notice("qfq: invalid max length %u\n", lmax); - return -EINVAL; - } - } else + else lmax = psched_mtu(qdisc_dev(sch)); + if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { + pr_notice("qfq: invalid max length %u\n", lmax); + return -EINVAL; + } + inv_w = ONE_FP / weight; weight = ONE_FP / inv_w; From 43235168793cb1d766ccd015c219068e0547c511 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 14 Apr 2023 10:46:19 +0200 Subject: [PATCH 097/175] firmware/psci: demote suspend-mode warning to info level On some Qualcomm platforms, like SC8280XP, the attempt to set PC mode during boot fails with PSCI_RET_DENIED and since commit 998fcd001feb ("firmware/psci: Print a warning if PSCI doesn't accept PC mode") this is now logged at warning level: psci: failed to set PC mode: -3 As there is nothing users can do about the firmware behaving this way, demote the warning to info level and clearly mark it as a firmware bug: psci: [Firmware Bug]: failed to set PC mode: -3 Reviewed-by: Ulf Hansson Acked-by: Mark Rutland Acked-by: Sudeep Holla Signed-off-by: Johan Hovold Acked-by: Dmitry Baryshkov Signed-off-by: Arnd Bergmann --- drivers/firmware/psci/psci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 29619f49873a..d9629ff87861 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -167,7 +167,8 @@ int psci_set_osi_mode(bool enable) err = invoke_psci_fn(PSCI_1_0_FN_SET_SUSPEND_MODE, suspend_mode, 0, 0); if (err < 0) - pr_warn("failed to set %s mode: %d\n", enable ? "OSI" : "PC", err); + pr_info(FW_BUG "failed to set %s mode: %d\n", + enable ? "OSI" : "PC", err); return psci_to_linux_errno(err); } From 860e1c7f8b0b43fbf91b4d689adfaa13adb89452 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 14 Apr 2023 15:53:13 +0800 Subject: [PATCH 098/175] io_uring: complete request via task work in case of DEFER_TASKRUN So far io_req_complete_post() only covers DEFER_TASKRUN by completing request via task work when the request is completed from IOWQ. However, uring command could be completed from any context, and if io uring is setup with DEFER_TASKRUN, the command is required to be completed from current context, otherwise wait on IORING_ENTER_GETEVENTS can't be wakeup, and may hang forever. The issue can be observed on removing ublk device, but turns out it is one generic issue for uring command & DEFER_TASKRUN, so solve it in io_uring core code. Fixes: e6aeb2721d3b ("io_uring: complete all requests in task context") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/linux-block/b3fc9991-4c53-9218-a8cc-5b4dd3952108@kernel.dk/ Reported-by: Jens Axboe Cc: Kanchan Joshi Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2a8b8c304d2a..4a865f0e85d0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -998,7 +998,7 @@ static void __io_req_complete_post(struct io_kiocb *req) void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { - if (req->ctx->task_complete && (issue_flags & IO_URING_F_IOWQ)) { + if (req->ctx->task_complete && req->ctx->submitter_task != current) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); } else if (!(issue_flags & IO_URING_F_UNLOCKED) || From c730fce7c70cfce831f4bdc9e49880ba1f61a092 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 14 Apr 2023 17:47:55 +0200 Subject: [PATCH 099/175] s390/bpf: Fix bpf_arch_text_poke() with new_addr == NULL Thomas Richter reported a crash in linux-next with a backtrace similar to the following one: [<0000000000000000>] 0x0 ([<000000000031a182>] bpf_trace_run4+0xc2/0x218) [<00000000001d59f4>] __bpf_trace_sched_switch+0x1c/0x28 [<0000000000c44a3a>] __schedule+0x43a/0x890 [<0000000000c44ef8>] schedule+0x68/0x110 [<0000000000c4e5ca>] do_nanosleep+0xa2/0x168 [<000000000026e7fe>] hrtimer_nanosleep+0xf6/0x1c0 [<000000000026eb6e>] __s390x_sys_nanosleep+0xb6/0xf0 [<0000000000c3b81c>] __do_syscall+0x1e4/0x208 [<0000000000c50510>] system_call+0x70/0x98 Last Breaking-Event-Address: [<000003ff7fda1814>] bpf_prog_65e887c70a835bbf_on_switch+0x1a4/0x1f0 The problem is that bpf_arch_text_poke() with new_addr == NULL is susceptible to the following race condition: T1 T2 ----------------- ------------------- plt.target = NULL entry: brcl 0xf,plt entry.mask = 0 lgrl %r1,plt.target br %r1 Fix by setting PLT target to the instruction following `brcl 0xf,plt` instead of 0. This way T2 will simply resume the execution of the eBPF program, which is the desired effect of passing new_addr == NULL. Fixes: f1d5df84cd8c ("s390/bpf: Implement bpf_arch_text_poke()") Reported-by: Thomas Richter Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Reviewed-by: Heiko Carstens Link: https://lore.kernel.org/bpf/20230414154755.184502-1-iii@linux.ibm.com --- arch/s390/net/bpf_jit_comp.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index d0846ba818ee..6b1876e4ad3f 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -539,7 +539,7 @@ static void bpf_jit_plt(void *plt, void *ret, void *target) { memcpy(plt, bpf_plt, BPF_PLT_SIZE); *(void **)((char *)plt + (bpf_plt_ret - bpf_plt)) = ret; - *(void **)((char *)plt + (bpf_plt_target - bpf_plt)) = target; + *(void **)((char *)plt + (bpf_plt_target - bpf_plt)) = target ?: ret; } /* @@ -2010,7 +2010,9 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, } __packed insn; char expected_plt[BPF_PLT_SIZE]; char current_plt[BPF_PLT_SIZE]; + char new_plt[BPF_PLT_SIZE]; char *plt; + char *ret; int err; /* Verify the branch to be patched. */ @@ -2032,12 +2034,15 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, err = copy_from_kernel_nofault(current_plt, plt, BPF_PLT_SIZE); if (err < 0) return err; - bpf_jit_plt(expected_plt, (char *)ip + 6, old_addr); + ret = (char *)ip + 6; + bpf_jit_plt(expected_plt, ret, old_addr); if (memcmp(current_plt, expected_plt, BPF_PLT_SIZE)) return -EINVAL; /* Adjust the call address. */ + bpf_jit_plt(new_plt, ret, new_addr); s390_kernel_write(plt + (bpf_plt_target - bpf_plt), - &new_addr, sizeof(void *)); + new_plt + (bpf_plt_target - bpf_plt), + sizeof(void *)); } /* Adjust the mask of the branch. */ From 5105a7ffce19160e7062aee67fb6b3b8a1b56d78 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Fri, 7 Apr 2023 00:34:11 +0200 Subject: [PATCH 100/175] cifs: fix negotiate context parsing smb311_decode_neg_context() doesn't properly check against SMB packet boundaries prior to accessing individual negotiate context entries. This is due to the length check omitting the eight byte smb2_neg_context header, as well as incorrect decrementing of len_of_ctxts. Fixes: 5100d8a3fe03 ("SMB311: Improve checking of negotiate security contexts") Reported-by: Volker Lendecke Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: David Disseldorp Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2b92132097dc..4245249dbba8 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -587,11 +587,15 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, } +/* If invalid preauth context warn but use what we requested, SHA-512 */ static void decode_preauth_context(struct smb2_preauth_neg_context *ctxt) { unsigned int len = le16_to_cpu(ctxt->DataLength); - /* If invalid preauth context warn but use what we requested, SHA-512 */ + /* + * Caller checked that DataLength remains within SMB boundary. We still + * need to confirm that one HashAlgorithms member is accounted for. + */ if (len < MIN_PREAUTH_CTXT_DATA_LEN) { pr_warn_once("server sent bad preauth context\n"); return; @@ -610,7 +614,11 @@ static void decode_compress_ctx(struct TCP_Server_Info *server, { unsigned int len = le16_to_cpu(ctxt->DataLength); - /* sizeof compress context is a one element compression capbility struct */ + /* + * Caller checked that DataLength remains within SMB boundary. We still + * need to confirm that one CompressionAlgorithms member is accounted + * for. + */ if (len < 10) { pr_warn_once("server sent bad compression cntxt\n"); return; @@ -632,6 +640,11 @@ static int decode_encrypt_ctx(struct TCP_Server_Info *server, unsigned int len = le16_to_cpu(ctxt->DataLength); cifs_dbg(FYI, "decode SMB3.11 encryption neg context of len %d\n", len); + /* + * Caller checked that DataLength remains within SMB boundary. We still + * need to confirm that one Cipher flexible array member is accounted + * for. + */ if (len < MIN_ENCRYPT_CTXT_DATA_LEN) { pr_warn_once("server sent bad crypto ctxt len\n"); return -EINVAL; @@ -678,6 +691,11 @@ static void decode_signing_ctx(struct TCP_Server_Info *server, { unsigned int len = le16_to_cpu(pctxt->DataLength); + /* + * Caller checked that DataLength remains within SMB boundary. We still + * need to confirm that one SigningAlgorithms flexible array member is + * accounted for. + */ if ((len < 4) || (len > 16)) { pr_warn_once("server sent bad signing negcontext\n"); return; @@ -719,14 +737,19 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, for (i = 0; i < ctxt_cnt; i++) { int clen; /* check that offset is not beyond end of SMB */ - if (len_of_ctxts == 0) - break; - if (len_of_ctxts < sizeof(struct smb2_neg_context)) break; pctx = (struct smb2_neg_context *)(offset + (char *)rsp); - clen = le16_to_cpu(pctx->DataLength); + clen = sizeof(struct smb2_neg_context) + + le16_to_cpu(pctx->DataLength); + /* + * 2.2.4 SMB2 NEGOTIATE Response + * Subsequent negotiate contexts MUST appear at the first 8-byte + * aligned offset following the previous negotiate context. + */ + if (i + 1 != ctxt_cnt) + clen = ALIGN(clen, 8); if (clen > len_of_ctxts) break; @@ -747,12 +770,10 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, else cifs_server_dbg(VFS, "unknown negcontext of type %d ignored\n", le16_to_cpu(pctx->ContextType)); - if (rc) break; - /* offsets must be 8 byte aligned */ - clen = ALIGN(clen, 8); - offset += clen + sizeof(struct smb2_neg_context); + + offset += clen; len_of_ctxts -= clen; } return rc; From 5efb685bb3af112038af78a2cdf28f0ffdad45f5 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Mon, 20 Mar 2023 15:08:38 +1100 Subject: [PATCH 101/175] initramfs: Check negative timestamp to prevent broken cpio archive Similar to commit 4c9d410f32b3 ("initramfs: Check timestamp to prevent broken cpio archive"), except asserts that the timestamp is non-negative. This can happen when the KBUILD_BUILD_TIMESTAMP is a value before UNIX epoch, which may be set when making reproducible builds that don't want to look like they use a valid date. While support for dates before 1970 might not be supported, this is more about preventing undetected CPIO corruption. The printf's use a minimum length format specifier, and will happily make the field longer than 8 characters if they need to. Signed-off-by: Benjamin Gray Reviewed-by: Andrew Donnellan Tested-by: Andrew Donnellan Signed-off-by: Masahiro Yamada --- usr/gen_init_cpio.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/usr/gen_init_cpio.c b/usr/gen_init_cpio.c index ee01e40e8bc6..61230532fef1 100644 --- a/usr/gen_init_cpio.c +++ b/usr/gen_init_cpio.c @@ -353,6 +353,12 @@ static int cpio_mkfile(const char *name, const char *location, buf.st_mtime = 0xffffffff; } + if (buf.st_mtime < 0) { + fprintf(stderr, "%s: Timestamp negative, clipping.\n", + location); + buf.st_mtime = 0; + } + if (buf.st_size > 0xffffffff) { fprintf(stderr, "%s: Size exceeds maximum cpio file size\n", location); @@ -602,10 +608,10 @@ int main (int argc, char *argv[]) /* * Timestamps after 2106-02-07 06:28:15 UTC have an ascii hex time_t * representation that exceeds 8 chars and breaks the cpio header - * specification. + * specification. Negative timestamps similarly exceed 8 chars. */ - if (default_mtime > 0xffffffff) { - fprintf(stderr, "ERROR: Timestamp too large for cpio format\n"); + if (default_mtime > 0xffffffff || default_mtime < 0) { + fprintf(stderr, "ERROR: Timestamp out of range for cpio format\n"); exit(1); } From 735faf92fb06d083ddcf6cfcf6665666dea5dcc1 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Tue, 21 Mar 2023 10:05:34 +1100 Subject: [PATCH 102/175] init/initramfs: Fix argument forwarding to panic() in panic_show_mem() Forwarding variadic argument lists can't be done by passing a va_list to a function with signature foo(...) (as panic() has). It ends up interpreting the va_list itself as a single argument instead of iterating it. printf() happily accepts it of course, leading to corrupt output. Convert panic_show_mem() to a macro to allow forwarding the arguments. The function is trivial enough that it's easier than trying to introduce a vpanic() variant. Signed-off-by: Benjamin Gray Reviewed-by: Andrew Donnellan Signed-off-by: Masahiro Yamada --- init/initramfs.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index f6c112e30bd4..e7a01c2ccd1b 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -60,15 +60,8 @@ static void __init error(char *x) message = x; } -static void panic_show_mem(const char *fmt, ...) -{ - va_list args; - - show_mem(0, NULL); - va_start(args, fmt); - panic(fmt, args); - va_end(args); -} +#define panic_show_mem(fmt, ...) \ + ({ show_mem(0, NULL); panic(fmt, ##__VA_ARGS__); }) /* link hash */ From f6d8283549bc200e2babdd627239ece3547d634c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 7 Apr 2023 19:16:27 +0900 Subject: [PATCH 103/175] kbuild: merge cmd_archive_linux and cmd_archive_perf The two commands, cmd_archive_linux and cmd_archive_perf, are similar. Merge them to make it easier to add more changes to the git-archive command. Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor --- scripts/Makefile.package | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/scripts/Makefile.package b/scripts/Makefile.package index 49aff12cb6ab..d80a9b59f784 100644 --- a/scripts/Makefile.package +++ b/scripts/Makefile.package @@ -57,16 +57,17 @@ check-git: false; \ fi +quiet_cmd_archive = ARCHIVE $@ + cmd_archive = git -C $(srctree) archive \ + --output=$$(realpath $@) --prefix=$(basename $@)/ $(archive-args) + # Linux source tarball # --------------------------------------------------------------------------- -quiet_cmd_archive_linux = ARCHIVE $@ - cmd_archive_linux = \ - git -C $(srctree) archive --output=$$(realpath $@) --prefix=$(basename $@)/ $$(cat $<) - targets += linux.tar +linux.tar: archive-args = $$(cat $<) linux.tar: .tmp_HEAD FORCE - $(call if_changed,archive_linux) + $(call if_changed,archive) # rpm-pkg # --------------------------------------------------------------------------- @@ -181,16 +182,14 @@ quiet_cmd_perf_version_file = GEN $@ .tmp_perf/PERF-VERSION-FILE: .tmp_HEAD $(srctree)/tools/perf/util/PERF-VERSION-GEN | .tmp_perf $(call cmd,perf_version_file) -quiet_cmd_archive_perf = ARCHIVE $@ - cmd_archive_perf = \ - git -C $(srctree) archive --output=$$(realpath $@) --prefix=$(basename $@)/ \ - --add-file=$$(realpath $(word 2, $^)) \ +perf-archive-args = --add-file=$$(realpath $(word 2, $^)) \ --add-file=$$(realpath $(word 3, $^)) \ $$(cat $(word 2, $^))^{tree} $$(cat $<) targets += perf-$(KERNELVERSION).tar +perf-$(KERNELVERSION).tar: archive-args = $(perf-archive-args) perf-$(KERNELVERSION).tar: tools/perf/MANIFEST .tmp_perf/HEAD .tmp_perf/PERF-VERSION-FILE FORCE - $(call if_changed,archive_perf) + $(call if_changed,archive) PHONY += perf-tar-src-pkg perf-tar-src-pkg: perf-$(KERNELVERSION).tar From f8d94c4e403c89ec6b09ba69f65e4547ba99dd07 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 7 Apr 2023 19:16:28 +0900 Subject: [PATCH 104/175] kbuild: do not create intermediate *.tar for source tarballs Since commit 05e96e96a315 ("kbuild: use git-archive for source package creation"), a source tarball is created in two steps; create *.tar file then compress it. I split the compression as a separate rule because I just thought 'git archive' supported only gzip. For other compression algorithms, I could pipe the two commands: $ git archive HEAD | xz > linux.tar.xz I read git-archive(1) carefully, and I realized GIT had provided a more elegant way: $ git -c tar.tar.xz.command=xz archive -o linux.tar.xz HEAD This commit uses 'tar.tar.*.command' configuration to specify the compression backend so we can compress a source tarball on-the-fly. GIT commit 767cf4579f0e ("archive: implement configurable tar filters") is more than a decade old, so it should be available on almost all build environments. Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor --- scripts/Makefile.package | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/scripts/Makefile.package b/scripts/Makefile.package index d80a9b59f784..ed1784be72ce 100644 --- a/scripts/Makefile.package +++ b/scripts/Makefile.package @@ -57,16 +57,23 @@ check-git: false; \ fi +git-config-tar.gz = -c tar.tar.gz.command="$(KGZIP)" +git-config-tar.bz2 = -c tar.tar.bz2.command="$(KBZIP2)" +git-config-tar.xz = -c tar.tar.xz.command="$(XZ)" +git-config-tar.zst = -c tar.tar.zst.command="$(ZSTD)" + quiet_cmd_archive = ARCHIVE $@ - cmd_archive = git -C $(srctree) archive \ + cmd_archive = git -C $(srctree) $(git-config-tar$(suffix $@)) archive \ --output=$$(realpath $@) --prefix=$(basename $@)/ $(archive-args) # Linux source tarball # --------------------------------------------------------------------------- -targets += linux.tar -linux.tar: archive-args = $$(cat $<) -linux.tar: .tmp_HEAD FORCE +linux-tarballs := $(addprefix linux, .tar.gz) + +targets += $(linux-tarballs) +$(linux-tarballs): archive-args = $$(cat $<) +$(linux-tarballs): .tmp_HEAD FORCE $(call if_changed,archive) # rpm-pkg @@ -186,9 +193,12 @@ perf-archive-args = --add-file=$$(realpath $(word 2, $^)) \ --add-file=$$(realpath $(word 3, $^)) \ $$(cat $(word 2, $^))^{tree} $$(cat $<) -targets += perf-$(KERNELVERSION).tar -perf-$(KERNELVERSION).tar: archive-args = $(perf-archive-args) -perf-$(KERNELVERSION).tar: tools/perf/MANIFEST .tmp_perf/HEAD .tmp_perf/PERF-VERSION-FILE FORCE + +perf-tarballs := $(addprefix perf-$(KERNELVERSION), .tar .tar.gz .tar.bz2 .tar.xz .tar.zst) + +targets += $(perf-tarballs) +$(perf-tarballs): archive-args = $(perf-archive-args) +$(perf-tarballs): tools/perf/MANIFEST .tmp_perf/HEAD .tmp_perf/PERF-VERSION-FILE FORCE $(call if_changed,archive) PHONY += perf-tar-src-pkg From 3c65a2704cdd2a0cd0766352e587bae4a6268155 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 7 Apr 2023 19:16:29 +0900 Subject: [PATCH 105/175] kbuild: do not create intermediate *.tar for tar packages Commit 05e96e96a315 ("kbuild: use git-archive for source package creation") split the compression as a separate step to factor out the common build rules. With the previous commit, we got back to the situation where source tarballs are compressed on-the-fly. There is no reason to keep the separate compression rules. Generate the comressed tar packages directly. Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor --- scripts/Makefile.package | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/scripts/Makefile.package b/scripts/Makefile.package index ed1784be72ce..4d90691505b1 100644 --- a/scripts/Makefile.package +++ b/scripts/Makefile.package @@ -27,21 +27,6 @@ fi ; \ tar -I $(KGZIP) -c $(RCS_TAR_IGNORE) -f $(2).tar.gz \ --transform 's:^:$(2)/:S' $(TAR_CONTENT) $(3) -# tarball compression -# --------------------------------------------------------------------------- - -%.tar.gz: %.tar - $(call cmd,gzip) - -%.tar.bz2: %.tar - $(call cmd,bzip2) - -%.tar.xz: %.tar - $(call cmd,xzmisc) - -%.tar.zst: %.tar - $(call cmd,zstd) - # Git # --------------------------------------------------------------------------- @@ -154,10 +139,17 @@ tar-install: FORCE $(Q)$(MAKE) -f $(srctree)/Makefile +$(Q)$(srctree)/scripts/package/buildtar $@ -quiet_cmd_tar = TAR $@ - cmd_tar = cd $<; tar cf ../$@ --owner=root --group=root --sort=name * +compress-tar.gz = -I "$(KGZIP)" +compress-tar.bz2 = -I "$(KBZIP2)" +compress-tar.xz = -I "$(XZ)" +compress-tar.zst = -I "$(ZSTD)" -linux-$(KERNELRELEASE)-$(ARCH).tar: tar-install +quiet_cmd_tar = TAR $@ + cmd_tar = cd $<; tar cf ../$@ $(compress-tar$(suffix $@)) --owner=root --group=root --sort=name * + +dir-tarballs := $(addprefix linux-$(KERNELRELEASE)-$(ARCH), .tar .tar.gz .tar.bz2 .tar.xz .tar.zst) + +$(dir-tarballs): tar-install $(call cmd,tar) PHONY += dir-pkg From 998ad18b00ebc0ef5a85be97fc020e710afc88ce Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 6 Apr 2023 00:18:53 +0800 Subject: [PATCH 106/175] mm: swap: fix performance regression on sparsetruncate-tiny The ->percpu_pvec_drained was originally introduced by commit d9ed0d08b6c6 ("mm: only drain per-cpu pagevecs once per pagevec usage") to drain per-cpu pagevecs only once per pagevec usage. But after converting the swap code to be more folio-based, the commit c2bc16817aa0 ("mm/swap: add folio_batch_move_lru()") breaks this logic, which would cause ->percpu_pvec_drained to be reset to false, that means per-cpu pagevecs will be drained multiple times per pagevec usage. In theory, there should be no functional changes when converting code to be more folio-based. We should call folio_batch_reinit() in folio_batch_move_lru() instead of folio_batch_init(). And to verify that we still need ->percpu_pvec_drained, I ran mmtests/sparsetruncate-tiny and got the following data: baseline with baseline/ patch/ Min Time 326.00 ( 0.00%) 328.00 ( -0.61%) 1st-qrtle Time 334.00 ( 0.00%) 336.00 ( -0.60%) 2nd-qrtle Time 338.00 ( 0.00%) 341.00 ( -0.89%) 3rd-qrtle Time 343.00 ( 0.00%) 347.00 ( -1.17%) Max-1 Time 326.00 ( 0.00%) 328.00 ( -0.61%) Max-5 Time 327.00 ( 0.00%) 330.00 ( -0.92%) Max-10 Time 328.00 ( 0.00%) 331.00 ( -0.91%) Max-90 Time 350.00 ( 0.00%) 357.00 ( -2.00%) Max-95 Time 395.00 ( 0.00%) 390.00 ( 1.27%) Max-99 Time 508.00 ( 0.00%) 434.00 ( 14.57%) Max Time 547.00 ( 0.00%) 476.00 ( 12.98%) Amean Time 344.61 ( 0.00%) 345.56 * -0.28%* Stddev Time 30.34 ( 0.00%) 19.51 ( 35.69%) CoeffVar Time 8.81 ( 0.00%) 5.65 ( 35.87%) BAmean-99 Time 342.38 ( 0.00%) 344.27 ( -0.55%) BAmean-95 Time 338.58 ( 0.00%) 341.87 ( -0.97%) BAmean-90 Time 336.89 ( 0.00%) 340.26 ( -1.00%) BAmean-75 Time 335.18 ( 0.00%) 338.40 ( -0.96%) BAmean-50 Time 332.54 ( 0.00%) 335.42 ( -0.87%) BAmean-25 Time 329.30 ( 0.00%) 332.00 ( -0.82%) From the above it can be seen that we get similar data to when ->percpu_pvec_drained was introduced, so we still need it. Let's call folio_batch_reinit() in folio_batch_move_lru() to restore the original logic. Link: https://lkml.kernel.org/r/20230405161854.6931-1-zhengqi.arch@bytedance.com Fixes: c2bc16817aa0 ("mm/swap: add folio_batch_move_lru()") Signed-off-by: Qi Zheng Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Mel Gorman Cc: Lorenzo Stoakes Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 57cb01b042f6..423199ee8478 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -222,7 +222,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_init(fbatch); + folio_batch_reinit(fbatch); } static void folio_batch_add_and_move(struct folio_batch *fbatch, From 24bf08c4376be417f16ceb609188b16f461b0443 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 5 Apr 2023 18:02:35 +0200 Subject: [PATCH 107/175] mm/userfaultfd: fix uffd-wp handling for THP migration entries Looks like what we fixed for hugetlb in commit 44f86392bdd1 ("mm/hugetlb: fix uffd-wp handling for migration entries in hugetlb_change_protection()") similarly applies to THP. Setting/clearing uffd-wp on THP migration entries is not implemented properly. Further, while removing migration PMDs considers the uffd-wp bit, inserting migration PMDs does not consider the uffd-wp bit. We have to set/clear independently of the migration entry type in change_huge_pmd() and properly copy the uffd-wp bit in set_pmd_migration_entry(). Verified using a simple reproducer that triggers migration of a THP, that the set_pmd_migration_entry() no longer loses the uffd-wp bit. Link: https://lkml.kernel.org/r/20230405160236.587705-2-david@redhat.com Fixes: f45ec5ff16a7 ("userfaultfd: wp: support swap and page migration") Signed-off-by: David Hildenbrand Reviewed-by: Peter Xu Cc: Cc: Muhammad Usama Anjum Signed-off-by: Andrew Morton --- mm/huge_memory.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 032fb0ef9cd1..e3706a2b34b2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1838,10 +1838,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); struct page *page = pfn_swap_entry_to_page(entry); + pmd_t newpmd; VM_BUG_ON(!is_pmd_migration_entry(*pmd)); if (is_writable_migration_entry(entry)) { - pmd_t newpmd; /* * A protection check is difficult so * just be safe and disable write @@ -1855,8 +1855,16 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, newpmd = pmd_swp_mksoft_dirty(newpmd); if (pmd_swp_uffd_wp(*pmd)) newpmd = pmd_swp_mkuffd_wp(newpmd); - set_pmd_at(mm, addr, pmd, newpmd); + } else { + newpmd = *pmd; } + + if (uffd_wp) + newpmd = pmd_swp_mkuffd_wp(newpmd); + else if (uffd_wp_resolve) + newpmd = pmd_swp_clear_uffd_wp(newpmd); + if (!pmd_same(*pmd, newpmd)) + set_pmd_at(mm, addr, pmd, newpmd); goto unlock; } #endif @@ -3251,6 +3259,8 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, pmdswp = swp_entry_to_pmd(entry); if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); + if (pmd_uffd_wp(pmdval)) + pmdswp = pmd_swp_mkuffd_wp(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); page_remove_rmap(page, vma, true); put_page(page); From dd47ac428c3f5f3bcabe845f36be870fe6c20784 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 5 Apr 2023 11:51:20 -0400 Subject: [PATCH 108/175] mm/khugepaged: check again on anon uffd-wp during isolation Khugepaged collapse an anonymous thp in two rounds of scans. The 2nd round done in __collapse_huge_page_isolate() after hpage_collapse_scan_pmd(), during which all the locks will be released temporarily. It means the pgtable can change during this phase before 2nd round starts. It's logically possible some ptes got wr-protected during this phase, and we can errornously collapse a thp without noticing some ptes are wr-protected by userfault. e1e267c7928f wanted to avoid it but it only did that for the 1st phase, not the 2nd phase. Since __collapse_huge_page_isolate() happens after a round of small page swapins, we don't need to worry on any !present ptes - if it existed khugepaged will already bail out. So we only need to check present ptes with uffd-wp bit set there. This is something I found only but never had a reproducer, I thought it was one caused a bug in Muhammad's recent pagemap new ioctl work, but it turns out it's not the cause of that but an userspace bug. However this seems to still be a real bug even with a very small race window, still worth to have it fixed and copy stable. Link: https://lkml.kernel.org/r/20230405155120.3608140-1-peterx@redhat.com Fixes: e1e267c7928f ("khugepaged: skip collapse if uffd-wp detected") Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Nadav Amit Cc: Signed-off-by: Andrew Morton --- mm/khugepaged.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 92e6f56a932d..0ec69b96b497 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -572,6 +572,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_PTE_NON_PRESENT; goto out; } + if (pte_uffd_wp(pteval)) { + result = SCAN_PTE_UFFD_WP; + goto out; + } page = vm_normal_page(vma, address, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; From 82f951340f25bba262766f82caec54e7fd6a73c7 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 6 Apr 2023 15:30:50 -0400 Subject: [PATCH 109/175] mm/mprotect: fix do_mprotect_pkey() return on error When the loop over the VMA is terminated early due to an error, the return code could be overwritten with ENOMEM. Fix the return code by only setting the error on early loop termination when the error is not set. User-visible effects include: attempts to run mprotect() against a special mapping or with a poorly-aligned hugetlb address should return -EINVAL, but they presently return -ENOMEM. In other cases an -EACCESS should be returned. Link: https://lkml.kernel.org/r/20230406193050.1363476-1-Liam.Howlett@oracle.com Fixes: 2286a6914c77 ("mm: change mprotect_fixup to vma iterator") Signed-off-by: Liam R. Howlett Cc: Signed-off-by: Andrew Morton --- mm/mprotect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 13e84d8c0797..36351a00c0e8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -838,7 +838,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } tlb_finish_mmu(&tlb); - if (vma_iter_end(&vmi) < end) + if (!error && vma_iter_end(&vmi) < end) error = -ENOMEM; out: From 4737edbbdd4958ae29ca6a310a6a2fa4e0684b01 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 6 Apr 2023 17:20:04 +0900 Subject: [PATCH 110/175] mm/huge_memory.c: warn with pr_warn_ratelimited instead of VM_WARN_ON_ONCE_FOLIO split_huge_page_to_list() WARNs when called for huge zero pages, which sounds to me too harsh because it does not imply a kernel bug, but just notifies the event to admins. On the other hand, this is considered as critical by syzkaller and makes its testing less efficient, which seems to me harmful. So replace the VM_WARN_ON_ONCE_FOLIO with pr_warn_ratelimited. Link: https://lkml.kernel.org/r/20230406082004.2185420-1-naoya.horiguchi@linux.dev Fixes: 478d134e9506 ("mm/huge_memory: do not overkill when splitting huge_zero_page") Signed-off-by: Naoya Horiguchi Reported-by: syzbot+07a218429c8d19b1fb25@syzkaller.appspotmail.com Link: https://lore.kernel.org/lkml/000000000000a6f34a05e6efcd01@google.com/ Reviewed-by: Yang Shi Cc: Miaohe Lin Cc: Tetsuo Handa Cc: Xu Yu Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e3706a2b34b2..3fae2d2496ab 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2665,9 +2665,10 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); is_hzp = is_huge_zero_page(&folio->page); - VM_WARN_ON_ONCE_FOLIO(is_hzp, folio); - if (is_hzp) + if (is_hzp) { + pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); return -EBUSY; + } if (folio_test_writeback(folio)) return -EBUSY; From f4e9e0e69468583c2c6d9d5c7bfc975e292bf188 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 10 Apr 2023 11:22:05 -0400 Subject: [PATCH 111/175] mm/mempolicy: fix use-after-free of VMA iterator set_mempolicy_home_node() iterates over a list of VMAs and calls mbind_range() on each VMA, which also iterates over the singular list of the VMA passed in and potentially splits the VMA. Since the VMA iterator is not passed through, set_mempolicy_home_node() may now point to a stale node in the VMA tree. This can result in a UAF as reported by syzbot. Avoid the stale maple tree node by passing the VMA iterator through to the underlying call to split_vma(). mbind_range() is also overly complicated, since there are two calling functions and one already handles iterating over the VMAs. Simplify mbind_range() to only handle merging and splitting of the VMAs. Align the new loop in do_mbind() and existing loop in set_mempolicy_home_node() to use the reduced mbind_range() function. This allows for a single location of the range calculation and avoids constantly looking up the previous VMA (since this is a loop over the VMAs). Link: https://lore.kernel.org/linux-mm/000000000000c93feb05f87e24ad@google.com/ Fixes: 66850be55e8e ("mm/mempolicy: use vma iterator & maple state instead of vma linked list") Signed-off-by: Liam R. Howlett Reported-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/20230410152205.2294819-1-Liam.Howlett@oracle.com Tested-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Cc: Signed-off-by: Andrew Morton --- mm/mempolicy.c | 102 +++++++++++++++++++++++-------------------------- 1 file changed, 48 insertions(+), 54 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a256a241fd1d..2068b594dc88 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -790,61 +790,50 @@ static int vma_replace_policy(struct vm_area_struct *vma, return err; } -/* Step 2: apply policy to a range and do splits. */ -static int mbind_range(struct mm_struct *mm, unsigned long start, - unsigned long end, struct mempolicy *new_pol) +/* Split or merge the VMA (if required) and apply the new policy */ +static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, struct mempolicy *new_pol) { - VMA_ITERATOR(vmi, mm, start); - struct vm_area_struct *prev; - struct vm_area_struct *vma; - int err = 0; + struct vm_area_struct *merged; + unsigned long vmstart, vmend; pgoff_t pgoff; + int err; - prev = vma_prev(&vmi); - vma = vma_find(&vmi, end); - if (WARN_ON(!vma)) + vmend = min(end, vma->vm_end); + if (start > vma->vm_start) { + *prev = vma; + vmstart = start; + } else { + vmstart = vma->vm_start; + } + + if (mpol_equal(vma_policy(vma), new_pol)) return 0; - if (start > vma->vm_start) - prev = vma; + pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); + merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, new_pol, + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + if (merged) { + *prev = merged; + return vma_replace_policy(merged, new_pol); + } - do { - unsigned long vmstart = max(start, vma->vm_start); - unsigned long vmend = min(end, vma->vm_end); - - if (mpol_equal(vma_policy(vma), new_pol)) - goto next; - - pgoff = vma->vm_pgoff + - ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); - if (prev) { - vma = prev; - goto replace; - } - if (vma->vm_start != vmstart) { - err = split_vma(&vmi, vma, vmstart, 1); - if (err) - goto out; - } - if (vma->vm_end != vmend) { - err = split_vma(&vmi, vma, vmend, 0); - if (err) - goto out; - } -replace: - err = vma_replace_policy(vma, new_pol); + if (vma->vm_start != vmstart) { + err = split_vma(vmi, vma, vmstart, 1); if (err) - goto out; -next: - prev = vma; - } for_each_vma_range(vmi, vma, end); + return err; + } -out: - return err; + if (vma->vm_end != vmend) { + err = split_vma(vmi, vma, vmend, 0); + if (err) + return err; + } + + *prev = vma; + return vma_replace_policy(vma, new_pol); } /* Set the process memory policy */ @@ -1259,6 +1248,8 @@ static long do_mbind(unsigned long start, unsigned long len, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + struct vma_iterator vmi; struct mempolicy *new; unsigned long end; int err; @@ -1328,7 +1319,13 @@ static long do_mbind(unsigned long start, unsigned long len, goto up_out; } - err = mbind_range(mm, start, end, new); + vma_iter_init(&vmi, mm, start); + prev = vma_prev(&vmi); + for_each_vma_range(vmi, vma, end) { + err = mbind_range(&vmi, vma, &prev, start, end, new); + if (err) + break; + } if (!err) { int nr_failed = 0; @@ -1489,10 +1486,8 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct mempolicy *new, *old; - unsigned long vmstart; - unsigned long vmend; unsigned long end; int err = -ENOENT; VMA_ITERATOR(vmi, mm, start); @@ -1521,6 +1516,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (end == start) return 0; mmap_write_lock(mm); + prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { /* * If any vma in the range got policy other than MPOL_BIND @@ -1541,9 +1537,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le } new->home_node = home_node; - vmstart = max(start, vma->vm_start); - vmend = min(end, vma->vm_end); - err = mbind_range(mm, vmstart, vmend, new); + err = mbind_range(&vmi, vma, &prev, start, end, new); mpol_put(new); if (err) break; From d2c115baae6f793b5b01cff67799da17ffb1eda5 Mon Sep 17 00:00:00 2001 From: Jonathan Toppins Date: Mon, 10 Apr 2023 17:39:35 -0400 Subject: [PATCH 112/175] mailmap: update jtoppins' entry to reference correct email Link: https://lkml.kernel.org/r/d79bc6eaf65e68bd1c2a1e1510ab6291ce5926a6.1681162487.git.jtoppins@redhat.com Signed-off-by: Jonathan Toppins Cc: Colin Ian King Cc: Jakub Kicinski Cc: Kirill Tkhai Cc: Konrad Dybcio Cc: Qais Yousef Cc: Stephen Hemminger Signed-off-by: Andrew Morton --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index e42486317d18..86d0760c15eb 100644 --- a/.mailmap +++ b/.mailmap @@ -232,6 +232,8 @@ Johan Hovold John Crispin John Paul Adrian Glaubitz John Stultz + + Jordan Crouse From 9235756885e865070c4be2facda75262dbd85967 Mon Sep 17 00:00:00 2001 From: Steve Chou Date: Tue, 11 Apr 2023 11:49:28 +0800 Subject: [PATCH 113/175] tools/mm/page_owner_sort.c: fix TGID output when cull=tg is used When using cull option with 'tg' flag, the fprintf is using pid instead of tgid. It should use tgid instead. Link: https://lkml.kernel.org/r/20230411034929.2071501-1-steve_chou@pesi.com.tw Fixes: 9c8a0a8e599f4a ("tools/vm/page_owner_sort.c: support for user-defined culling rules") Signed-off-by: Steve Chou Cc: Jiajian Ye Cc: Signed-off-by: Andrew Morton --- tools/mm/page_owner_sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c index 7c2ac124cdc8..99798894b879 100644 --- a/tools/mm/page_owner_sort.c +++ b/tools/mm/page_owner_sort.c @@ -857,7 +857,7 @@ int main(int argc, char **argv) if (cull & CULL_PID || filter & FILTER_PID) fprintf(fout, ", PID %d", list[i].pid); if (cull & CULL_TGID || filter & FILTER_TGID) - fprintf(fout, ", TGID %d", list[i].pid); + fprintf(fout, ", TGID %d", list[i].tgid); if (cull & CULL_COMM || filter & FILTER_COMM) fprintf(fout, ", task_comm_name: %s", list[i].comm); if (cull & CULL_ALLOCATOR) { From 1f5f12ece722aacea1769fb644f27790ede339dc Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Apr 2023 12:10:04 +0800 Subject: [PATCH 114/175] maple_tree: fix a potential memory leak, OOB access, or other unpredictable bug In mas_alloc_nodes(), "node->node_count = 0" means to initialize the node_count field of the new node, but the node may not be a new node. It may be a node that existed before and node_count has a value, setting it to 0 will cause a memory leak. At this time, mas->alloc->total will be greater than the actual number of nodes in the linked list, which may cause many other errors. For example, out-of-bounds access in mas_pop_node(), and mas_pop_node() may return addresses that should not be used. Fix it by initializing node_count only for new nodes. Also, by the way, an if-else statement was removed to simplify the code. Link: https://lkml.kernel.org/r/20230411041005.26205-1-zhangpeng.00@bytedance.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index db60edb55f2f..7ff2a821a2a1 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1303,26 +1303,21 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) node = mas->alloc; node->request_count = 0; while (requested) { - max_req = MAPLE_ALLOC_SLOTS; - if (node->node_count) { - unsigned int offset = node->node_count; - - slots = (void **)&node->slot[offset]; - max_req -= offset; - } else { - slots = (void **)&node->slot; - } - + max_req = MAPLE_ALLOC_SLOTS - node->node_count; + slots = (void **)&node->slot[node->node_count]; max_req = min(requested, max_req); count = mt_alloc_bulk(gfp, max_req, slots); if (!count) goto nomem_bulk; + if (node->node_count == 0) { + node->slot[0]->node_count = 0; + node->slot[0]->request_count = 0; + } + node->node_count += count; allocated += count; node = node->slot[0]; - node->node_count = 0; - node->request_count = 0; requested -= count; } mas->alloc->total = allocated; From 1ba1199ec5747f475538c0d25a32804e5ba1dfde Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 10 Apr 2023 21:08:26 +0800 Subject: [PATCH 115/175] writeback, cgroup: fix null-ptr-deref write in bdi_split_work_to_wbs KASAN report null-ptr-deref: ================================================================== BUG: KASAN: null-ptr-deref in bdi_split_work_to_wbs+0x5c5/0x7b0 Write of size 8 at addr 0000000000000000 by task sync/943 CPU: 5 PID: 943 Comm: sync Tainted: 6.3.0-rc5-next-20230406-dirty #461 Call Trace: dump_stack_lvl+0x7f/0xc0 print_report+0x2ba/0x340 kasan_report+0xc4/0x120 kasan_check_range+0x1b7/0x2e0 __kasan_check_write+0x24/0x40 bdi_split_work_to_wbs+0x5c5/0x7b0 sync_inodes_sb+0x195/0x630 sync_inodes_one_sb+0x3a/0x50 iterate_supers+0x106/0x1b0 ksys_sync+0x98/0x160 [...] ================================================================== The race that causes the above issue is as follows: cpu1 cpu2 -------------------------|------------------------- inode_switch_wbs INIT_WORK(&isw->work, inode_switch_wbs_work_fn) queue_rcu_work(isw_wq, &isw->work) // queue_work async inode_switch_wbs_work_fn wb_put_many(old_wb, nr_switched) percpu_ref_put_many ref->data->release(ref) cgwb_release queue_work(cgwb_release_wq, &wb->release_work) // queue_work async &wb->release_work cgwb_release_workfn ksys_sync iterate_supers sync_inodes_one_sb sync_inodes_sb bdi_split_work_to_wbs kmalloc(sizeof(*work), GFP_ATOMIC) // alloc memory failed percpu_ref_exit ref->data = NULL kfree(data) wb_get(wb) percpu_ref_get(&wb->refcnt) percpu_ref_get_many(ref, 1) atomic_long_add(nr, &ref->data->count) atomic64_add(i, v) // trigger null-ptr-deref bdi_split_work_to_wbs() traverses &bdi->wb_list to split work into all wbs. If the allocation of new work fails, the on-stack fallback will be used and the reference count of the current wb is increased afterwards. If cgroup writeback membership switches occur before getting the reference count and the current wb is released as old_wd, then calling wb_get() or wb_put() will trigger the null pointer dereference above. This issue was introduced in v4.3-rc7 (see fix tag1). Both sync_inodes_sb() and __writeback_inodes_sb_nr() calls to bdi_split_work_to_wbs() can trigger this issue. For scenarios called via sync_inodes_sb(), originally commit 7fc5854f8c6e ("writeback: synchronize sync(2) against cgroup writeback membership switches") reduced the possibility of the issue by adding wb_switch_rwsem, but in v5.14-rc1 (see fix tag2) removed the "inode_io_list_del_locked(inode, old_wb)" from inode_switch_wbs_work_fn() so that wb->state contains WB_has_dirty_io, thus old_wb is not skipped when traversing wbs in bdi_split_work_to_wbs(), and the issue becomes easily reproducible again. To solve this problem, percpu_ref_exit() is called under RCU protection to avoid race between cgwb_release_workfn() and bdi_split_work_to_wbs(). Moreover, replace wb_get() with wb_tryget() in bdi_split_work_to_wbs(), and skip the current wb if wb_tryget() fails because the wb has already been shutdown. Link: https://lkml.kernel.org/r/20230410130826.1492525-1-libaokun1@huawei.com Fixes: b817525a4a80 ("writeback: bdi_writeback iteration must not skip dying ones") Signed-off-by: Baokun Li Reviewed-by: Jan Kara Acked-by: Tejun Heo Cc: Alexander Viro Cc: Andreas Dilger Cc: Christian Brauner Cc: Dennis Zhou Cc: Hou Tao Cc: yangerkun Cc: Zhang Yi Cc: Jens Axboe Cc: Signed-off-by: Andrew Morton --- fs/fs-writeback.c | 17 ++++++++++------- mm/backing-dev.c | 12 ++++++++++-- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 195dc23e0d83..1db3e3c24b43 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -978,6 +978,16 @@ restart: continue; } + /* + * If wb_tryget fails, the wb has been shutdown, skip it. + * + * Pin @wb so that it stays on @bdi->wb_list. This allows + * continuing iteration from @wb after dropping and + * regrabbing rcu read lock. + */ + if (!wb_tryget(wb)) + continue; + /* alloc failed, execute synchronously using on-stack fallback */ work = &fallback_work; *work = *base_work; @@ -986,13 +996,6 @@ restart: work->done = &fallback_work_done; wb_queue_work(wb, work); - - /* - * Pin @wb so that it stays on @bdi->wb_list. This allows - * continuing iteration from @wb after dropping and - * regrabbing rcu read lock. - */ - wb_get(wb); last_wb = wb; rcu_read_unlock(); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a53b9360b72e..30d2d0386fdb 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -507,6 +507,15 @@ static LIST_HEAD(offline_cgwbs); static void cleanup_offline_cgwbs_workfn(struct work_struct *work); static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); +static void cgwb_free_rcu(struct rcu_head *rcu_head) +{ + struct bdi_writeback *wb = container_of(rcu_head, + struct bdi_writeback, rcu); + + percpu_ref_exit(&wb->refcnt); + kfree(wb); +} + static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, @@ -529,11 +538,10 @@ static void cgwb_release_workfn(struct work_struct *work) list_del(&wb->offline_node); spin_unlock_irq(&cgwb_lock); - percpu_ref_exit(&wb->refcnt); wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); - kfree_rcu(wb, rcu); + call_rcu(&wb->rcu, cgwb_free_rcu); } static void cgwb_release(struct percpu_ref *refcnt) From 2ff559f31a5d50c31a3f9d849f8af90dc36c7105 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 12 Apr 2023 12:38:52 -0400 Subject: [PATCH 116/175] Revert "userfaultfd: don't fail on unrecognized features" This is a proposal to revert commit 914eedcb9ba0ff53c33808. I found this when writing a simple UFFDIO_API test to be the first unit test in this set. Two things breaks with the commit: - UFFDIO_API check was lost and missing. According to man page, the kernel should reject ioctl(UFFDIO_API) if uffdio_api.api != 0xaa. This check is needed if the api version will be extended in the future, or user app won't be able to identify which is a new kernel. - Feature flags checks were removed, which means UFFDIO_API with a feature that does not exist will also succeed. According to the man page, we should (and it makes sense) to reject ioctl(UFFDIO_API) if unknown features passed in. Link: https://lore.kernel.org/r/20220722201513.1624158-1-axelrasmussen@google.com Link: https://lkml.kernel.org/r/20230412163922.327282-2-peterx@redhat.com Fixes: 914eedcb9ba0 ("userfaultfd: don't fail on unrecognized features") Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Axel Rasmussen Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Zach O'Keefe Cc: Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 44d1ee429eb0..40f9e1a2ebdd 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1955,8 +1955,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; - /* Ignore unsupported features (userspace built against newer kernel) */ - features = uffdio_api.features & UFFD_API_FEATURES; + features = uffdio_api.features; + ret = -EINVAL; + if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) + goto err_out; ret = -EPERM; if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) goto err_out; From 6a8f57ae2eb07ab39a6f0ccad60c760743051026 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 16 Apr 2023 15:23:53 -0700 Subject: [PATCH 117/175] Linux 6.3-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5aeea3d98fc0..b5c48e3c935a 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 3 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* From 853618d5886bf94812f31228091cd37d308230f7 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Fri, 14 Apr 2023 14:08:35 +0800 Subject: [PATCH 118/175] virtio_net: bugfix overflow inside xdp_linearize_page() Here we copy the data from the original buf to the new page. But we not check that it may be overflow. As long as the size received(including vnethdr) is greater than 3840 (PAGE_SIZE -VIRTIO_XDP_HEADROOM). Then the memcpy will overflow. And this is completely possible, as long as the MTU is large, such as 4096. In our test environment, this will cause crash. Since crash is caused by the written memory, it is meaningless, so I do not include it. Fixes: 72979a6c3590 ("virtio_net: xdp, add slowpath case for non contiguous buffers") Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 2396c28c0122..ea1bd4bb326d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -814,8 +814,13 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, int page_off, unsigned int *len) { - struct page *page = alloc_page(GFP_ATOMIC); + int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + struct page *page; + if (page_off + *len + tailroom > PAGE_SIZE) + return NULL; + + page = alloc_page(GFP_ATOMIC); if (!page) return NULL; @@ -823,7 +828,6 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, page_off += *len; while (--*num_buf) { - int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); unsigned int buflen; void *buf; int off; From a80bb8e7233b2ad6ff119646b6e33fb3edcec37b Mon Sep 17 00:00:00 2001 From: Ding Hui Date: Fri, 14 Apr 2023 23:23:06 +0800 Subject: [PATCH 119/175] sfc: Fix use-after-free due to selftest_work There is a use-after-free scenario that is: When the NIC is down, user set mac address or vlan tag to VF, the xxx_set_vf_mac() or xxx_set_vf_vlan() will invoke efx_net_stop() and efx_net_open(), since netif_running() is false, the port will not start and keep port_enabled false, but selftest_work is scheduled in efx_net_open(). If we remove the device before selftest_work run, the efx_stop_port() will not be called since the NIC is down, and then efx is freed, we will soon get a UAF in run_timer_softirq() like this: [ 1178.907941] ================================================================== [ 1178.907948] BUG: KASAN: use-after-free in run_timer_softirq+0xdea/0xe90 [ 1178.907950] Write of size 8 at addr ff11001f449cdc80 by task swapper/47/0 [ 1178.907950] [ 1178.907953] CPU: 47 PID: 0 Comm: swapper/47 Kdump: loaded Tainted: G O --------- -t - 4.18.0 #1 [ 1178.907954] Hardware name: SANGFOR X620G40/WI2HG-208T1061A, BIOS SPYH051032-U01 04/01/2022 [ 1178.907955] Call Trace: [ 1178.907956] [ 1178.907960] dump_stack+0x71/0xab [ 1178.907963] print_address_description+0x6b/0x290 [ 1178.907965] ? run_timer_softirq+0xdea/0xe90 [ 1178.907967] kasan_report+0x14a/0x2b0 [ 1178.907968] run_timer_softirq+0xdea/0xe90 [ 1178.907971] ? init_timer_key+0x170/0x170 [ 1178.907973] ? hrtimer_cancel+0x20/0x20 [ 1178.907976] ? sched_clock+0x5/0x10 [ 1178.907978] ? sched_clock_cpu+0x18/0x170 [ 1178.907981] __do_softirq+0x1c8/0x5fa [ 1178.907985] irq_exit+0x213/0x240 [ 1178.907987] smp_apic_timer_interrupt+0xd0/0x330 [ 1178.907989] apic_timer_interrupt+0xf/0x20 [ 1178.907990] [ 1178.907991] RIP: 0010:mwait_idle+0xae/0x370 If the NIC is not actually brought up, there is no need to schedule selftest_work, so let's move invoking efx_selftest_async_start() into efx_start_all(), and it will be canceled by broughting down. Fixes: dd40781e3a4e ("sfc: Run event/IRQ self-test asynchronously when interface is brought up") Fixes: e340be923012 ("sfc: add ndo_set_vf_mac() function for EF10") Debugged-by: Huang Cun Cc: Donglin Peng Suggested-by: Martin Habets Signed-off-by: Ding Hui Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/efx.c | 1 - drivers/net/ethernet/sfc/efx_common.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 884d8d168862..1eceffa02b55 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -541,7 +541,6 @@ int efx_net_open(struct net_device *net_dev) else efx->state = STATE_NET_UP; - efx_selftest_async_start(efx); return 0; } diff --git a/drivers/net/ethernet/sfc/efx_common.c b/drivers/net/ethernet/sfc/efx_common.c index cc30524c2fe4..361687de308d 100644 --- a/drivers/net/ethernet/sfc/efx_common.c +++ b/drivers/net/ethernet/sfc/efx_common.c @@ -544,6 +544,8 @@ void efx_start_all(struct efx_nic *efx) /* Start the hardware monitor if there is one */ efx_start_monitor(efx); + efx_selftest_async_start(efx); + /* Link state detection is normally event-driven; we have * to poll now because we could have missed a change */ From 338469d677e5d426f5ada88761f16f6d2c7c1981 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Sat, 15 Apr 2023 12:33:09 -0300 Subject: [PATCH 120/175] net/sched: clear actions pointer in miss cookie init fail Palash reports a UAF when using a modified version of syzkaller[1]. When 'tcf_exts_miss_cookie_base_alloc()' fails in 'tcf_exts_init_ex()' a call to 'tcf_exts_destroy()' is made to free up the tcf_exts resources. In flower, a call to '__fl_put()' when 'tcf_exts_init_ex()' fails is made; Then calling 'tcf_exts_destroy()', which triggers an UAF since the already freed tcf_exts action pointer is lingering in the struct. Before the offending patch, this was not an issue since there was no case where the tcf_exts action pointer could linger. Therefore, restore the old semantic by clearing the action pointer in case of a failure to initialize the miss_cookie. [1] https://github.com/cmu-pasta/linux-kernel-enriched-corpus v1->v2: Fix compilation on configs without tc actions (kernel test robot) Fixes: 80cd22c35c90 ("net/sched: cls_api: Support hardware miss to tc action") Reported-by: Palash Oswal Acked-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Signed-off-by: David S. Miller --- net/sched/cls_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2a6b6be0811b..35785a36c802 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3235,6 +3235,9 @@ int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, err_miss_alloc: tcf_exts_destroy(exts); +#ifdef CONFIG_NET_CLS_ACT + exts->actions = NULL; +#endif return err; } EXPORT_SYMBOL(tcf_exts_init_ex); From c55c0e91c813589dc55bea6bf9a9fbfaa10ae41d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 17 Apr 2023 10:21:36 +0200 Subject: [PATCH 121/175] netfilter: nf_tables: fix ifdef to also consider nf_tables=m nftables can be built as a module, so fix the preprocessor conditional accordingly. Fixes: 478b360a47b7 ("netfilter: nf_tables: fix nf_trace always-on with XT_TRACE=n") Reported-by: Florian Fainelli Reported-by: Jakub Kicinski Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1ec3530c8191..dbcaac8b6966 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4713,7 +4713,7 @@ static inline void nf_reset_ct(struct sk_buff *skb) static inline void nf_reset_trace(struct sk_buff *skb) { -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) skb->nf_trace = 0; #endif } @@ -4733,7 +4733,7 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, dst->_nfct = src->_nfct; nf_conntrack_get(skb_nfct(src)); #endif -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) if (copy) dst->nf_trace = src->nf_trace; #endif From d51425190805d47aecc1910b272e65476dd3b937 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 16 Apr 2023 13:05:06 -0400 Subject: [PATCH 122/175] SUNRPC: Fix failures of checksum Kunit tests Scott reports that when the new GSS krb5 Kunit tests are built as a separate module and loaded, the RFC 6803 and RFC 8009 checksum tests all fail, even though they pass when run under kunit.py. It appears that passing a buffer backed by static const memory to gss_krb5_checksum() is a problem. A printk in checksum_case() shows the correct plaintext, but by the time the buffer has been converted to a scatterlist and arrives at checksummer(), it contains all zeroes. Replacing this buffer with one that is dynamically allocated fixes the issue. Reported-by: Scott Mayhew Fixes: 02142b2ca8fc ("SUNRPC: Add checksum KUnit tests for the RFC 6803 encryption types") Tested-by: Scott Mayhew Signed-off-by: Chuck Lever --- net/sunrpc/auth_gss/gss_krb5_test.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/auth_gss/gss_krb5_test.c b/net/sunrpc/auth_gss/gss_krb5_test.c index aa6ec4e858aa..95ca783795c5 100644 --- a/net/sunrpc/auth_gss/gss_krb5_test.c +++ b/net/sunrpc/auth_gss/gss_krb5_test.c @@ -73,7 +73,6 @@ static void checksum_case(struct kunit *test) { const struct gss_krb5_test_param *param = test->param_value; struct xdr_buf buf = { - .head[0].iov_base = param->plaintext->data, .head[0].iov_len = param->plaintext->len, .len = param->plaintext->len, }; @@ -99,6 +98,10 @@ static void checksum_case(struct kunit *test) err = crypto_ahash_setkey(tfm, Kc.data, Kc.len); KUNIT_ASSERT_EQ(test, err, 0); + buf.head[0].iov_base = kunit_kzalloc(test, buf.head[0].iov_len, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buf.head[0].iov_base); + memcpy(buf.head[0].iov_base, param->plaintext->data, buf.head[0].iov_len); + checksum.len = gk5e->cksumlength; checksum.data = kunit_kzalloc(test, checksum.len, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, checksum.data); From 8485d093b076e59baff424552e8aecfc5bd2d261 Mon Sep 17 00:00:00 2001 From: Aleksandr Loktionov Date: Fri, 24 Mar 2023 18:16:38 +0100 Subject: [PATCH 123/175] i40e: fix accessing vsi->active_filters without holding lock Fix accessing vsi->active_filters without holding the mac_filter_hash_lock. Move vsi->active_filters = 0 inside critical section and move clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state) after the critical section to ensure the new filters from other threads can be added only after filters cleaning in the critical section is finished. Fixes: 278e7d0b9d68 ("i40e: store MAC/VLAN filters in a hash with the MAC Address as key") Signed-off-by: Aleksandr Loktionov Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 228cd502bb48..6313575a4b6c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -14133,15 +14133,15 @@ static int i40e_add_vsi(struct i40e_vsi *vsi) vsi->id = ctxt.vsi_number; } - vsi->active_filters = 0; - clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state); spin_lock_bh(&vsi->mac_filter_hash_lock); + vsi->active_filters = 0; /* If macvlan filters already exist, force them to get loaded */ hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { f->state = I40E_FILTER_NEW; f_count++; } spin_unlock_bh(&vsi->mac_filter_hash_lock); + clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state); if (f_count) { vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED; From c86c00c6935505929cc9adb29ddb85e48c71f828 Mon Sep 17 00:00:00 2001 From: Aleksandr Loktionov Date: Mon, 3 Apr 2023 07:13:18 +0200 Subject: [PATCH 124/175] i40e: fix i40e_setup_misc_vector() error handling Add error handling of i40e_setup_misc_vector() in i40e_rebuild(). In case interrupt vectors setup fails do not re-open vsi-s and do not bring up vf-s, we have no interrupts to serve a traffic anyway. Fixes: 41c445ff0f48 ("i40e: main driver core") Signed-off-by: Aleksandr Loktionov Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 6313575a4b6c..7c30abd0dfc2 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11059,8 +11059,11 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) pf->hw.aq.asq_last_status)); } /* reinit the misc interrupt */ - if (pf->flags & I40E_FLAG_MSIX_ENABLED) + if (pf->flags & I40E_FLAG_MSIX_ENABLED) { ret = i40e_setup_misc_vector(pf); + if (ret) + goto end_unlock; + } /* Add a filter to drop all Flow control frames from any VSI from being * transmitted. By doing so we stop a malicious VF from sending out From 1a2bd3bd72e978304cdc0a7385e8048e8242225d Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 14 Apr 2023 09:26:14 -0700 Subject: [PATCH 125/175] ice: document RDMA devlink parameters Commit e523af4ee560 ("net/ice: Add support for enable_iwarp and enable_roce devlink param") added support for the enable_roce and enable_iwarp parameters in the ice driver. It didn't document these parameters in the ice devlink documentation file. Add this documentation, including a note about the mutual exclusion between the two modes. Signed-off-by: Jacob Keller Reviewed-by: Leon Romanovsky Acked-by: Tony Nguyen Link: https://lore.kernel.org/r/20230414162614.571861-1-jacob.e.keller@intel.com Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/ice.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Documentation/networking/devlink/ice.rst b/Documentation/networking/devlink/ice.rst index 10f282c2117c..2f60e34ab926 100644 --- a/Documentation/networking/devlink/ice.rst +++ b/Documentation/networking/devlink/ice.rst @@ -7,6 +7,21 @@ ice devlink support This document describes the devlink features implemented by the ``ice`` device driver. +Parameters +========== + +.. list-table:: Generic parameters implemented + + * - Name + - Mode + - Notes + * - ``enable_roce`` + - runtime + - mutually exclusive with ``enable_iwarp`` + * - ``enable_iwarp`` + - runtime + - mutually exclusive with ``enable_roce`` + Info versions ============= From d46fc894147cf98dd6e8210aa99ed46854191840 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Apr 2023 12:14:29 +0200 Subject: [PATCH 126/175] netfilter: nf_tables: validate catch-all set elements catch-all set element might jump/goto to chain that uses expressions that require validation. Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++ net/netfilter/nf_tables_api.c | 64 ++++++++++++++++++++++++++++--- net/netfilter/nft_lookup.c | 36 ++--------------- 3 files changed, 66 insertions(+), 38 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 9430128aae99..1b8e305bb54a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1085,6 +1085,10 @@ struct nft_chain { }; int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); +int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem); +int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index cd52109e674a..98043e83af71 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3447,6 +3447,64 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) return 0; } +int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + struct nft_ctx *pctx = (struct nft_ctx *)ctx; + const struct nft_data *data; + int err; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && + *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) + return 0; + + data = nft_set_ext_data(ext); + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + pctx->level++; + err = nft_chain_validate(ctx, data->verdict.chain); + if (err < 0) + return err; + pctx->level--; + break; + default: + break; + } + + return 0; +} + +struct nft_set_elem_catchall { + struct list_head list; + struct rcu_head rcu; + void *elem; +}; + +int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_elem elem; + struct nft_set_ext *ext; + int ret = 0; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + elem.priv = catchall->elem; + ret = nft_setelem_validate(ctx, set, NULL, &elem); + if (ret < 0) + return ret; + } + + return ret; +} + static struct nft_rule *nft_rule_lookup_byid(const struct net *net, const struct nft_chain *chain, const struct nlattr *nla); @@ -4759,12 +4817,6 @@ err_set_name: return err; } -struct nft_set_elem_catchall { - struct list_head list; - struct rcu_head rcu; - void *elem; -}; - static void nft_set_catchall_destroy(const struct nft_ctx *ctx, struct nft_set *set) { diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index cae5a6724163..cecf8ab90e58 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -199,37 +199,6 @@ nla_put_failure: return -1; } -static int nft_lookup_validate_setelem(const struct nft_ctx *ctx, - struct nft_set *set, - const struct nft_set_iter *iter, - struct nft_set_elem *elem) -{ - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); - struct nft_ctx *pctx = (struct nft_ctx *)ctx; - const struct nft_data *data; - int err; - - if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && - *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) - return 0; - - data = nft_set_ext_data(ext); - switch (data->verdict.code) { - case NFT_JUMP: - case NFT_GOTO: - pctx->level++; - err = nft_chain_validate(ctx, data->verdict.chain); - if (err < 0) - return err; - pctx->level--; - break; - default: - break; - } - - return 0; -} - static int nft_lookup_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **d) @@ -245,9 +214,12 @@ static int nft_lookup_validate(const struct nft_ctx *ctx, iter.skip = 0; iter.count = 0; iter.err = 0; - iter.fn = nft_lookup_validate_setelem; + iter.fn = nft_setelem_validate; priv->set->ops->walk(ctx, priv->set, &iter); + if (!iter.err) + iter.err = nft_set_catchall_validate(ctx, priv->set); + if (iter.err < 0) return iter.err; From e50b9b9e8610d47b7c22529443e45a16b1ea3a15 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Sat, 15 Apr 2023 16:12:27 +0800 Subject: [PATCH 127/175] cxgb4: fix use after free bugs caused by circular dependency problem The flower_stats_timer can schedule flower_stats_work and flower_stats_work can also arm the flower_stats_timer. The process is shown below: ----------- timer schedules work ------------ ch_flower_stats_cb() //timer handler schedule_work(&adap->flower_stats_work); ----------- work arms timer ------------ ch_flower_stats_handler() //workqueue callback function mod_timer(&adap->flower_stats_timer, ...); When the cxgb4 device is detaching, the timer and workqueue could still be rearmed. The process is shown below: (cleanup routine) | (timer and workqueue routine) remove_one() | free_some_resources() | ch_flower_stats_cb() //timer cxgb4_cleanup_tc_flower() | schedule_work() del_timer_sync() | | ch_flower_stats_handler() //workqueue | mod_timer() cancel_work_sync() | kfree(adapter) //FREE | ch_flower_stats_cb() //timer | adap->flower_stats_work //USE This patch changes del_timer_sync() to timer_shutdown_sync(), which could prevent rearming of the timer from the workqueue. Fixes: e0f911c81e93 ("cxgb4: fetch stats for offloaded tc flower flows") Signed-off-by: Duoming Zhou Link: https://lore.kernel.org/r/20230415081227.7463-1-duoming@zju.edu.cn Signed-off-by: Paolo Abeni --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c index dd9be229819a..d3541159487d 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -1135,7 +1135,7 @@ void cxgb4_cleanup_tc_flower(struct adapter *adap) return; if (adap->flower_stats_timer.function) - del_timer_sync(&adap->flower_stats_timer); + timer_shutdown_sync(&adap->flower_stats_timer); cancel_work_sync(&adap->flower_stats_work); rhashtable_destroy(&adap->flower_tbl); adap->tc_flower_initialized = false; From d4eb7e39929a3b1ff30fb751b4859fc2410702a0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Apr 2023 17:50:28 +0200 Subject: [PATCH 128/175] netfilter: nf_tables: tighten netlink attribute requirements for catch-all elements If NFT_SET_ELEM_CATCHALL is set on, then userspace provides no set element key. Otherwise, bail out with -EINVAL. Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 98043e83af71..e48ab8dfb541 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6108,7 +6108,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) + if (((flags & NFT_SET_ELEM_CATCHALL) && nla[NFTA_SET_ELEM_KEY]) || + (!(flags & NFT_SET_ELEM_CATCHALL) && !nla[NFTA_SET_ELEM_KEY])) return -EINVAL; if (flags != 0) { From e8b51a1a15d5a3cce231e0669f6a161dc5bb9b75 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 16 Apr 2023 23:58:18 -0700 Subject: [PATCH 129/175] bnxt_en: Do not initialize PTP on older P3/P4 chips The driver does not support PTP on these older chips and it is assuming that firmware on these older chips will not return the PORT_MAC_PTP_QCFG_RESP_FLAGS_HWRM_ACCESS flag in __bnxt_hwrm_ptp_qcfg(), causing the function to abort quietly. But newer firmware now sets this flag and so __bnxt_hwrm_ptp_qcfg() will proceed further. Eventually it will fail in bnxt_ptp_init() -> bnxt_map_ptp_regs() because there is no code to support the older chips. The driver will then complain: "PTP initialization failed.\n" Fix it so that we abort quietly earlier without going through the unnecessary steps and alarming the user with the warning log. Fixes: ae5c42f0b92c ("bnxt_en: Get PTP hardware capability from firmware") Signed-off-by: Michael Chan Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c23e3b397bcf..ef97a4190b39 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7627,7 +7627,7 @@ static int __bnxt_hwrm_ptp_qcfg(struct bnxt *bp) u8 flags; int rc; - if (bp->hwrm_spec_code < 0x10801) { + if (bp->hwrm_spec_code < 0x10801 || !BNXT_CHIP_P5_THOR(bp)) { rc = -ENODEV; goto no_ptp; } From 4f4e54b1041e60694117893cd986831153a3e719 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Sun, 16 Apr 2023 23:58:19 -0700 Subject: [PATCH 130/175] bnxt_en: Fix a possible NULL pointer dereference in unload path In the driver unload path, the driver currently checks the valid BNXT_FLAG_ROCE_CAP flag in bnxt_rdma_aux_device_uninit() before proceeding. This is flawed because the flag may not be set initially during driver load. It may be set later after the NVRAM setting is changed followed by a firmware reset. Relying on the BNXT_FLAG_ROCE_CAP flag may crash in bnxt_rdma_aux_device_uninit() if the aux device was never initialized: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 PGD 8ae6aa067 P4D 0 Oops: 0000 [#1] SMP NOPTI CPU: 39 PID: 42558 Comm: rmmod Kdump: loaded Tainted: G OE --------- - - 4.18.0-348.el8.x86_64 #1 Hardware name: Dell Inc. PowerEdge R750/0WT8Y6, BIOS 1.5.4 12/17/2021 RIP: 0010:device_del+0x1b/0x410 Code: 89 a5 50 03 00 00 4c 89 a5 58 03 00 00 eb 89 0f 1f 44 00 00 41 56 41 55 41 54 4c 8d a7 80 00 00 00 55 53 48 89 fb 48 83 ec 18 <48> 8b 2f 4c 89 e7 65 48 8b 04 25 28 00 00 00 48 89 44 24 10 31 c0 RSP: 0018:ff7f82bf469a7dc8 EFLAGS: 00010292 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000206 RDI: 0000000000000000 RBP: ff31b7cd114b0ac0 R08: 0000000000000000 R09: ffffffff935c3400 R10: ff31b7cd45bc3440 R11: 0000000000000001 R12: 0000000000000080 R13: ffffffffc1069f40 R14: 0000000000000000 R15: 0000000000000000 FS: 00007fc9903ce740(0000) GS:ff31b7d4ffac0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000992fee004 CR4: 0000000000773ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: bnxt_rdma_aux_device_uninit+0x1f/0x30 [bnxt_en] bnxt_remove_one+0x2f/0x1f0 [bnxt_en] pci_device_remove+0x3b/0xc0 device_release_driver_internal+0x103/0x1f0 driver_detach+0x54/0x88 bus_remove_driver+0x77/0xc9 pci_unregister_driver+0x2d/0xb0 bnxt_exit+0x16/0x2c [bnxt_en] __x64_sys_delete_module+0x139/0x280 do_syscall_64+0x5b/0x1a0 entry_SYSCALL_64_after_hwframe+0x65/0xca RIP: 0033:0x7fc98f3af71b Fix this by modifying the check inside bnxt_rdma_aux_device_uninit() to check for bp->aux_priv instead. We also need to make some changes in bnxt_rdma_aux_device_init() to make sure that bp->aux_priv is set only when the aux device is fully initialized. Fixes: d80d88b0dfff ("bnxt_en: Add auxiliary driver support") Reviewed-by: Ajit Khaparde Signed-off-by: Kalesh AP Signed-off-by: Michael Chan Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index e7b5e28ee29f..852eb449ccae 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -304,7 +304,7 @@ void bnxt_rdma_aux_device_uninit(struct bnxt *bp) struct auxiliary_device *adev; /* Skip if no auxiliary device init was done. */ - if (!(bp->flags & BNXT_FLAG_ROCE_CAP)) + if (!bp->aux_priv) return; aux_priv = bp->aux_priv; @@ -324,6 +324,7 @@ static void bnxt_aux_dev_release(struct device *dev) bp->edev = NULL; kfree(aux_priv->edev); kfree(aux_priv); + bp->aux_priv = NULL; } static void bnxt_set_edev_info(struct bnxt_en_dev *edev, struct bnxt *bp) @@ -359,19 +360,18 @@ void bnxt_rdma_aux_device_init(struct bnxt *bp) if (!(bp->flags & BNXT_FLAG_ROCE_CAP)) return; - bp->aux_priv = kzalloc(sizeof(*bp->aux_priv), GFP_KERNEL); - if (!bp->aux_priv) + aux_priv = kzalloc(sizeof(*bp->aux_priv), GFP_KERNEL); + if (!aux_priv) goto exit; - bp->aux_priv->id = ida_alloc(&bnxt_aux_dev_ids, GFP_KERNEL); - if (bp->aux_priv->id < 0) { + aux_priv->id = ida_alloc(&bnxt_aux_dev_ids, GFP_KERNEL); + if (aux_priv->id < 0) { netdev_warn(bp->dev, "ida alloc failed for ROCE auxiliary device\n"); - kfree(bp->aux_priv); + kfree(aux_priv); goto exit; } - aux_priv = bp->aux_priv; aux_dev = &aux_priv->aux_dev; aux_dev->id = aux_priv->id; aux_dev->name = "rdma"; @@ -380,10 +380,11 @@ void bnxt_rdma_aux_device_init(struct bnxt *bp) rc = auxiliary_device_init(aux_dev); if (rc) { - ida_free(&bnxt_aux_dev_ids, bp->aux_priv->id); - kfree(bp->aux_priv); + ida_free(&bnxt_aux_dev_ids, aux_priv->id); + kfree(aux_priv); goto exit; } + bp->aux_priv = aux_priv; /* From this point, all cleanup will happen via the .release callback & * any error unwinding will need to include a call to From c0e73276f0fcbbd3d4736ba975d7dc7a48791b0c Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Mon, 17 Apr 2023 05:07:18 -0700 Subject: [PATCH 131/175] mlxfw: fix null-ptr-deref in mlxfw_mfa2_tlv_next() Function mlxfw_mfa2_tlv_multi_get() returns NULL if 'tlv' in question does not pass checks in mlxfw_mfa2_tlv_payload_get(). This behaviour may lead to NULL pointer dereference in 'multi->total_len'. Fix this issue by testing mlxfw_mfa2_tlv_multi_get()'s return value against NULL. Found by Linux Verification Center (linuxtesting.org) with static analysis tool SVACE. Fixes: 410ed13cae39 ("Add the mlxfw module for Mellanox firmware flash process") Co-developed-by: Natalia Petrova Signed-off-by: Nikita Zhandarovich Reviewed-by: Ido Schimmel Link: https://lore.kernel.org/r/20230417120718.52325-1-n.zhandarovich@fintech.ru Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c index 017d68f1e123..972c571b4158 100644 --- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c @@ -31,6 +31,8 @@ mlxfw_mfa2_tlv_next(const struct mlxfw_mfa2_file *mfa2_file, if (tlv->type == MLXFW_MFA2_TLV_MULTI_PART) { multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, tlv); + if (!multi) + return NULL; tlv_len = NLA_ALIGN(tlv_len + be16_to_cpu(multi->total_len)); } From 16c52e503043aed1e2a2ce38d9249de5936c1f6b Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 18 Apr 2023 19:38:58 +0800 Subject: [PATCH 132/175] LoongArch: Make WriteCombine configurable for ioremap() LoongArch maintains cache coherency in hardware, but when paired with LS7A chipsets the WUC attribute (Weak-ordered UnCached, which is similar to WriteCombine) is out of the scope of cache coherency machanism for PCIe devices (this is a PCIe protocol violation, which may be fixed in newer chipsets). This means WUC can only used for write-only memory regions now, so this option is disabled by default, making WUC silently fallback to SUC for ioremap(). You can enable this option if the kernel is ensured to run on hardware without this bug. Kernel parameter writecombine=on/off can be used to override the Kconfig option. Cc: stable@vger.kernel.org Suggested-by: WANG Xuerui Reviewed-by: WANG Xuerui Signed-off-by: Huacai Chen --- .../admin-guide/kernel-parameters.rst | 1 + .../admin-guide/kernel-parameters.txt | 6 ++++++ arch/loongarch/Kconfig | 16 ++++++++++++++ arch/loongarch/include/asm/io.h | 4 +++- arch/loongarch/kernel/setup.c | 21 +++++++++++++++++++ 5 files changed, 47 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst index 19600c50277b..6ae5f129fbca 100644 --- a/Documentation/admin-guide/kernel-parameters.rst +++ b/Documentation/admin-guide/kernel-parameters.rst @@ -128,6 +128,7 @@ parameter is applicable:: KVM Kernel Virtual Machine support is enabled. LIBATA Libata driver is enabled LP Printer support is enabled. + LOONGARCH LoongArch architecture is enabled. LOOP Loopback device support is enabled. M68k M68k architecture is enabled. These options have more detailed description inside of diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6221a1d057dd..7016cb12dc4e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6933,6 +6933,12 @@ When enabled, memory and cache locality will be impacted. + writecombine= [LOONGARCH] Control the MAT (Memory Access Type) of + ioremap_wc(). + + on - Enable writecombine, use WUC for ioremap_wc() + off - Disable writecombine, use SUC for ioremap_wc() + x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of default x2apic cluster mode on platforms supporting x2apic. diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 7fd51257e0ed..3ddde336e6a5 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -447,6 +447,22 @@ config ARCH_IOREMAP protection support. However, you can enable LoongArch DMW-based ioremap() for better performance. +config ARCH_WRITECOMBINE + bool "Enable WriteCombine (WUC) for ioremap()" + help + LoongArch maintains cache coherency in hardware, but when paired + with LS7A chipsets the WUC attribute (Weak-ordered UnCached, which + is similar to WriteCombine) is out of the scope of cache coherency + machanism for PCIe devices (this is a PCIe protocol violation, which + may be fixed in newer chipsets). + + This means WUC can only used for write-only memory regions now, so + this option is disabled by default, making WUC silently fallback to + SUC for ioremap(). You can enable this option if the kernel is ensured + to run on hardware without this bug. + + You can override this setting via writecombine=on/off boot parameter. + config ARCH_STRICT_ALIGN bool "Enable -mstrict-align to prevent unaligned accesses" if EXPERT default y diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h index 402a7d9e3a53..545e2708fbf7 100644 --- a/arch/loongarch/include/asm/io.h +++ b/arch/loongarch/include/asm/io.h @@ -54,8 +54,10 @@ static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size, * @offset: bus address of the memory * @size: size of the resource to map */ +extern pgprot_t pgprot_wc; + #define ioremap_wc(offset, size) \ - ioremap_prot((offset), (size), pgprot_val(PAGE_KERNEL_WUC)) + ioremap_prot((offset), (size), pgprot_val(pgprot_wc)) #define ioremap_cache(offset, size) \ ioremap_prot((offset), (size), pgprot_val(PAGE_KERNEL)) diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index bae84ccf6d36..27f71f9531e1 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -160,6 +160,27 @@ static void __init smbios_parse(void) dmi_walk(find_tokens, NULL); } +#ifdef CONFIG_ARCH_WRITECOMBINE +pgprot_t pgprot_wc = PAGE_KERNEL_WUC; +#else +pgprot_t pgprot_wc = PAGE_KERNEL_SUC; +#endif + +EXPORT_SYMBOL(pgprot_wc); + +static int __init setup_writecombine(char *p) +{ + if (!strcmp(p, "on")) + pgprot_wc = PAGE_KERNEL_WUC; + else if (!strcmp(p, "off")) + pgprot_wc = PAGE_KERNEL_SUC; + else + pr_warn("Unknown writecombine setting \"%s\".\n", p); + + return 0; +} +early_param("writecombine", setup_writecombine); + static int usermem __initdata; static int __init early_parse_mem(char *p) From df830336045db1246d3245d3737fee9939c5f731 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 18 Apr 2023 19:38:58 +0800 Subject: [PATCH 133/175] LoongArch: Fix probing of the CRC32 feature Not all LoongArch processors support CRC32 instructions. This feature is indicated by CPUCFG1.CRC32 (Bit25) but it is wrongly defined in the previous versions of the ISA manual (and so does in loongarch.h). The CRC32 feature is set unconditionally now, so fix it. BTW, expose the CRC32 feature in /proc/cpuinfo. Cc: stable@vger.kernel.org Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/cpu-features.h | 1 + arch/loongarch/include/asm/cpu.h | 40 ++++++++++++----------- arch/loongarch/include/asm/loongarch.h | 2 +- arch/loongarch/kernel/cpu-probe.c | 7 +++- arch/loongarch/kernel/proc.c | 1 + 5 files changed, 30 insertions(+), 21 deletions(-) diff --git a/arch/loongarch/include/asm/cpu-features.h b/arch/loongarch/include/asm/cpu-features.h index b07974218393..f6177f133477 100644 --- a/arch/loongarch/include/asm/cpu-features.h +++ b/arch/loongarch/include/asm/cpu-features.h @@ -42,6 +42,7 @@ #define cpu_has_fpu cpu_opt(LOONGARCH_CPU_FPU) #define cpu_has_lsx cpu_opt(LOONGARCH_CPU_LSX) #define cpu_has_lasx cpu_opt(LOONGARCH_CPU_LASX) +#define cpu_has_crc32 cpu_opt(LOONGARCH_CPU_CRC32) #define cpu_has_complex cpu_opt(LOONGARCH_CPU_COMPLEX) #define cpu_has_crypto cpu_opt(LOONGARCH_CPU_CRYPTO) #define cpu_has_lvz cpu_opt(LOONGARCH_CPU_LVZ) diff --git a/arch/loongarch/include/asm/cpu.h b/arch/loongarch/include/asm/cpu.h index c3da91759472..88773d849e33 100644 --- a/arch/loongarch/include/asm/cpu.h +++ b/arch/loongarch/include/asm/cpu.h @@ -78,25 +78,26 @@ enum cpu_type_enum { #define CPU_FEATURE_FPU 3 /* CPU has FPU */ #define CPU_FEATURE_LSX 4 /* CPU has LSX (128-bit SIMD) */ #define CPU_FEATURE_LASX 5 /* CPU has LASX (256-bit SIMD) */ -#define CPU_FEATURE_COMPLEX 6 /* CPU has Complex instructions */ -#define CPU_FEATURE_CRYPTO 7 /* CPU has Crypto instructions */ -#define CPU_FEATURE_LVZ 8 /* CPU has Virtualization extension */ -#define CPU_FEATURE_LBT_X86 9 /* CPU has X86 Binary Translation */ -#define CPU_FEATURE_LBT_ARM 10 /* CPU has ARM Binary Translation */ -#define CPU_FEATURE_LBT_MIPS 11 /* CPU has MIPS Binary Translation */ -#define CPU_FEATURE_TLB 12 /* CPU has TLB */ -#define CPU_FEATURE_CSR 13 /* CPU has CSR */ -#define CPU_FEATURE_WATCH 14 /* CPU has watchpoint registers */ -#define CPU_FEATURE_VINT 15 /* CPU has vectored interrupts */ -#define CPU_FEATURE_CSRIPI 16 /* CPU has CSR-IPI */ -#define CPU_FEATURE_EXTIOI 17 /* CPU has EXT-IOI */ -#define CPU_FEATURE_PREFETCH 18 /* CPU has prefetch instructions */ -#define CPU_FEATURE_PMP 19 /* CPU has perfermance counter */ -#define CPU_FEATURE_SCALEFREQ 20 /* CPU supports cpufreq scaling */ -#define CPU_FEATURE_FLATMODE 21 /* CPU has flat mode */ -#define CPU_FEATURE_EIODECODE 22 /* CPU has EXTIOI interrupt pin decode mode */ -#define CPU_FEATURE_GUESTID 23 /* CPU has GuestID feature */ -#define CPU_FEATURE_HYPERVISOR 24 /* CPU has hypervisor (running in VM) */ +#define CPU_FEATURE_CRC32 6 /* CPU has CRC32 instructions */ +#define CPU_FEATURE_COMPLEX 7 /* CPU has Complex instructions */ +#define CPU_FEATURE_CRYPTO 8 /* CPU has Crypto instructions */ +#define CPU_FEATURE_LVZ 9 /* CPU has Virtualization extension */ +#define CPU_FEATURE_LBT_X86 10 /* CPU has X86 Binary Translation */ +#define CPU_FEATURE_LBT_ARM 11 /* CPU has ARM Binary Translation */ +#define CPU_FEATURE_LBT_MIPS 12 /* CPU has MIPS Binary Translation */ +#define CPU_FEATURE_TLB 13 /* CPU has TLB */ +#define CPU_FEATURE_CSR 14 /* CPU has CSR */ +#define CPU_FEATURE_WATCH 15 /* CPU has watchpoint registers */ +#define CPU_FEATURE_VINT 16 /* CPU has vectored interrupts */ +#define CPU_FEATURE_CSRIPI 17 /* CPU has CSR-IPI */ +#define CPU_FEATURE_EXTIOI 18 /* CPU has EXT-IOI */ +#define CPU_FEATURE_PREFETCH 19 /* CPU has prefetch instructions */ +#define CPU_FEATURE_PMP 20 /* CPU has perfermance counter */ +#define CPU_FEATURE_SCALEFREQ 21 /* CPU supports cpufreq scaling */ +#define CPU_FEATURE_FLATMODE 22 /* CPU has flat mode */ +#define CPU_FEATURE_EIODECODE 23 /* CPU has EXTIOI interrupt pin decode mode */ +#define CPU_FEATURE_GUESTID 24 /* CPU has GuestID feature */ +#define CPU_FEATURE_HYPERVISOR 25 /* CPU has hypervisor (running in VM) */ #define LOONGARCH_CPU_CPUCFG BIT_ULL(CPU_FEATURE_CPUCFG) #define LOONGARCH_CPU_LAM BIT_ULL(CPU_FEATURE_LAM) @@ -104,6 +105,7 @@ enum cpu_type_enum { #define LOONGARCH_CPU_FPU BIT_ULL(CPU_FEATURE_FPU) #define LOONGARCH_CPU_LSX BIT_ULL(CPU_FEATURE_LSX) #define LOONGARCH_CPU_LASX BIT_ULL(CPU_FEATURE_LASX) +#define LOONGARCH_CPU_CRC32 BIT_ULL(CPU_FEATURE_CRC32) #define LOONGARCH_CPU_COMPLEX BIT_ULL(CPU_FEATURE_COMPLEX) #define LOONGARCH_CPU_CRYPTO BIT_ULL(CPU_FEATURE_CRYPTO) #define LOONGARCH_CPU_LVZ BIT_ULL(CPU_FEATURE_LVZ) diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 65b7dcdea16d..8c2969965c3c 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -117,7 +117,7 @@ static inline u32 read_cpucfg(u32 reg) #define CPUCFG1_EP BIT(22) #define CPUCFG1_RPLV BIT(23) #define CPUCFG1_HUGEPG BIT(24) -#define CPUCFG1_IOCSRBRD BIT(25) +#define CPUCFG1_CRC32 BIT(25) #define CPUCFG1_MSGINT BIT(26) #define LOONGARCH_CPUCFG2 0x2 diff --git a/arch/loongarch/kernel/cpu-probe.c b/arch/loongarch/kernel/cpu-probe.c index 3a3fce2d7846..482643167119 100644 --- a/arch/loongarch/kernel/cpu-probe.c +++ b/arch/loongarch/kernel/cpu-probe.c @@ -94,13 +94,18 @@ static void cpu_probe_common(struct cpuinfo_loongarch *c) c->options = LOONGARCH_CPU_CPUCFG | LOONGARCH_CPU_CSR | LOONGARCH_CPU_TLB | LOONGARCH_CPU_VINT | LOONGARCH_CPU_WATCH; - elf_hwcap = HWCAP_LOONGARCH_CPUCFG | HWCAP_LOONGARCH_CRC32; + elf_hwcap = HWCAP_LOONGARCH_CPUCFG; config = read_cpucfg(LOONGARCH_CPUCFG1); if (config & CPUCFG1_UAL) { c->options |= LOONGARCH_CPU_UAL; elf_hwcap |= HWCAP_LOONGARCH_UAL; } + if (config & CPUCFG1_CRC32) { + c->options |= LOONGARCH_CPU_CRC32; + elf_hwcap |= HWCAP_LOONGARCH_CRC32; + } + config = read_cpucfg(LOONGARCH_CPUCFG2); if (config & CPUCFG2_LAM) { diff --git a/arch/loongarch/kernel/proc.c b/arch/loongarch/kernel/proc.c index 5c67cc4fd56d..0d82907b5404 100644 --- a/arch/loongarch/kernel/proc.c +++ b/arch/loongarch/kernel/proc.c @@ -76,6 +76,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (cpu_has_fpu) seq_printf(m, " fpu"); if (cpu_has_lsx) seq_printf(m, " lsx"); if (cpu_has_lasx) seq_printf(m, " lasx"); + if (cpu_has_crc32) seq_printf(m, " crc32"); if (cpu_has_complex) seq_printf(m, " complex"); if (cpu_has_crypto) seq_printf(m, " crypto"); if (cpu_has_lvz) seq_printf(m, " lvz"); From 1cf62488f5e465b1cd814d19be238a4b7ad5be38 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 18 Apr 2023 19:38:58 +0800 Subject: [PATCH 134/175] LoongArch: Fix build error if CONFIG_SUSPEND is not set We can see the following build error on LoongArch if CONFIG_SUSPEND is not set: ld: drivers/acpi/sleep.o: in function 'acpi_pm_prepare': sleep.c:(.text+0x2b8): undefined reference to 'loongarch_wakeup_start' Here is the call trace: acpi_pm_prepare() __acpi_pm_prepare() acpi_sleep_prepare() acpi_get_wakeup_address() loongarch_wakeup_start() Root cause: loongarch_wakeup_start() is defined in arch/loongarch/power/ suspend_asm.S which is only built under CONFIG_SUSPEND. In order to fix the build error, just let acpi_get_wakeup_address() return 0 if CONFIG_ SUSPEND is not set. Fixes: 366bb35a8e48 ("LoongArch: Add suspend (ACPI S3) support") Reviewed-by: WANG Xuerui Reported-by: Randy Dunlap Link: https://lore.kernel.org/all/11215033-fa3c-ecb1-2fc0-e9aeba47be9b@infradead.org/ Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/acpi.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/loongarch/include/asm/acpi.h b/arch/loongarch/include/asm/acpi.h index 4198753aa1d0..976a810352c6 100644 --- a/arch/loongarch/include/asm/acpi.h +++ b/arch/loongarch/include/asm/acpi.h @@ -41,8 +41,11 @@ extern void loongarch_suspend_enter(void); static inline unsigned long acpi_get_wakeup_address(void) { +#ifdef CONFIG_SUSPEND extern void loongarch_wakeup_start(void); return (unsigned long)loongarch_wakeup_start; +#endif + return 0UL; } #endif /* _ASM_LOONGARCH_ACPI_H */ From 6637775ca3c3d0b4f7b83c5ce9a592df2c9bff52 Mon Sep 17 00:00:00 2001 From: Qing Zhang Date: Tue, 18 Apr 2023 19:38:58 +0800 Subject: [PATCH 135/175] LoongArch: Fix _CONST64_(x) as unsigned Addresses should all be of unsigned type to avoid unnecessary conversions. Signed-off-by: Qing Zhang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/addrspace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/include/asm/addrspace.h b/arch/loongarch/include/asm/addrspace.h index 8fb699b4d40a..5c9c03bdf915 100644 --- a/arch/loongarch/include/asm/addrspace.h +++ b/arch/loongarch/include/asm/addrspace.h @@ -71,9 +71,9 @@ extern unsigned long vm_map_base; #define _ATYPE32_ int #define _ATYPE64_ __s64 #ifdef CONFIG_64BIT -#define _CONST64_(x) x ## L +#define _CONST64_(x) x ## UL #else -#define _CONST64_(x) x ## LL +#define _CONST64_(x) x ## ULL #endif #endif From 1c1378a4090845e12c4dbbb337de7acce309b570 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 18 Apr 2023 19:38:58 +0800 Subject: [PATCH 136/175] LoongArch: Enable PG when wakeup from suspend Some firmwares don't enable PG when wakeup from suspend, so do it in kernel. This can improve code compatibility for boot kernel. Signed-off-by: Baoqi Zhang Signed-off-by: Huacai Chen --- arch/loongarch/power/suspend_asm.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/loongarch/power/suspend_asm.S b/arch/loongarch/power/suspend_asm.S index 90da899c06a1..e2fc3b4e31f0 100644 --- a/arch/loongarch/power/suspend_asm.S +++ b/arch/loongarch/power/suspend_asm.S @@ -80,6 +80,10 @@ SYM_INNER_LABEL(loongarch_wakeup_start, SYM_L_GLOBAL) JUMP_VIRT_ADDR t0, t1 + /* Enable PG */ + li.w t0, 0xb0 # PLV=0, IE=0, PG=1 + csrwr t0, LOONGARCH_CSR_CRMD + la.pcrel t0, acpi_saved_sp ld.d sp, t0, 0 SETUP_WAKEUP From dce5ea1d0f45fa612f5760b88614a3f32bc75e3f Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 18 Apr 2023 19:38:58 +0800 Subject: [PATCH 137/175] LoongArch: Mark 3 symbol exports as non-GPL vm_map_base, empty_zero_page and invalid_pmd_table could be accessed widely by some out-of-tree non-GPL but important file systems or drivers (e.g. OpenZFS). Let's use EXPORT_SYMBOL() instead of EXPORT_SYMBOL_GPL() to export them, so as to avoid build errors. 1, Details about vm_map_base: This is a LoongArch-specific symbol and may be referenced through macros PCI_IOBASE, VMALLOC_START and VMALLOC_END. 2, Details about empty_zero_page: As it stands today, only 3 architectures export empty_zero_page as a GPL symbol: IA64, LoongArch and MIPS. LoongArch gets the GPL export by inheriting from MIPS, and the MIPS export was first introduced in commit 497d2adcbf50b ("[MIPS] Export empty_zero_page for sake of the ext4 module."). The IA64 export was similar: commit a7d57ecf4216e ("[IA64] Export three symbols for module use") did so for kvm. In both IA64 and MIPS, the export of empty_zero_page was done for satisfying some in-kernel component built as module (kvm and ext4 respectively), and given its reasonably low-level nature, GPL is a reasonable choice. But looking at the bigger picture it is evident most other architectures do not regard it as GPL, so in effect the symbol probably should not be treated as such, in favor of consistency. 3, Details about invalid_pmd_table: Keep consistency with invalid_pte_table and make it be possible by some modules. Cc: stable@vger.kernel.org Reviewed-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/kernel/cpu-probe.c | 2 +- arch/loongarch/mm/init.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/kernel/cpu-probe.c b/arch/loongarch/kernel/cpu-probe.c index 482643167119..5adf0f736c6d 100644 --- a/arch/loongarch/kernel/cpu-probe.c +++ b/arch/loongarch/kernel/cpu-probe.c @@ -60,7 +60,7 @@ static inline void set_elf_platform(int cpu, const char *plat) /* MAP BASE */ unsigned long vm_map_base; -EXPORT_SYMBOL_GPL(vm_map_base); +EXPORT_SYMBOL(vm_map_base); static void cpu_probe_addrbits(struct cpuinfo_loongarch *c) { diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index e018aed34586..3b7d8129570b 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -41,7 +41,7 @@ * don't have to care about aliases on other CPUs. */ unsigned long empty_zero_page, zero_page_mask; -EXPORT_SYMBOL_GPL(empty_zero_page); +EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(zero_page_mask); void setup_zero_pages(void) @@ -270,7 +270,7 @@ pud_t invalid_pud_table[PTRS_PER_PUD] __page_aligned_bss; #endif #ifndef __PAGETABLE_PMD_FOLDED pmd_t invalid_pmd_table[PTRS_PER_PMD] __page_aligned_bss; -EXPORT_SYMBOL_GPL(invalid_pmd_table); +EXPORT_SYMBOL(invalid_pmd_table); #endif pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned_bss; EXPORT_SYMBOL(invalid_pte_table); From 93eb1215ed794a18ba8753e0654f069d58838966 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 18 Apr 2023 19:38:58 +0800 Subject: [PATCH 138/175] LoongArch: module: set section addresses to 0x0 These got*, plt* and .text.ftrace_trampoline sections specified for LoongArch have non-zero addressses. Non-zero section addresses in a relocatable ELF would confuse GDB when it tries to compute the section offsets and it ends up printing wrong symbol addresses. Therefore, set them to zero, which mirrors the change in commit 5d8591bc0fbaeb6ded ("arm64 module: set plt* section addresses to 0x0"). Cc: stable@vger.kernel.org Reviewed-by: Guo Ren Signed-off-by: Chong Qiao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/module.lds.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/include/asm/module.lds.h b/arch/loongarch/include/asm/module.lds.h index 438f09d4ccf4..88554f92e010 100644 --- a/arch/loongarch/include/asm/module.lds.h +++ b/arch/loongarch/include/asm/module.lds.h @@ -2,8 +2,8 @@ /* Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ SECTIONS { . = ALIGN(4); - .got : { BYTE(0) } - .plt : { BYTE(0) } - .plt.idx : { BYTE(0) } - .ftrace_trampoline : { BYTE(0) } + .got 0 : { BYTE(0) } + .plt 0 : { BYTE(0) } + .plt.idx 0 : { BYTE(0) } + .ftrace_trampoline 0 : { BYTE(0) } } From 8267fc71abb2dc47338570e56dd3473a58313fce Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 17 Apr 2023 23:53:22 +0200 Subject: [PATCH 139/175] veth: take into account peer device for NETDEV_XDP_ACT_NDO_XMIT xdp_features flag For veth pairs, NETDEV_XDP_ACT_NDO_XMIT is supported by the current device if the peer one is running a XDP program or if it has GRO enabled. Fix the xdp_features flags reporting considering peer device and not current one for NETDEV_XDP_ACT_NDO_XMIT. Fixes: fccca038f300 ("veth: take into account device reconfiguration for xdp_features flag") Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/4f1ca6f6f6b42ae125bfdb5c7782217c83968b2e.1681767806.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- drivers/net/veth.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index e1b38fbf1dd9..4b3c6647edc6 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1262,11 +1262,12 @@ static void veth_set_xdp_features(struct net_device *dev) peer = rtnl_dereference(priv->peer); if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { + struct veth_priv *priv_peer = netdev_priv(peer); xdp_features_t val = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_RX_SG; - if (priv->_xdp_prog || veth_gro_requested(dev)) + if (priv_peer->_xdp_prog || veth_gro_requested(peer)) val |= NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG; xdp_set_features_flag(dev, val); @@ -1504,19 +1505,23 @@ static int veth_set_features(struct net_device *dev, { netdev_features_t changed = features ^ dev->features; struct veth_priv *priv = netdev_priv(dev); + struct net_device *peer; int err; if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) return 0; + peer = rtnl_dereference(priv->peer); if (features & NETIF_F_GRO) { err = veth_napi_enable(dev); if (err) return err; - xdp_features_set_redirect_target(dev, true); + if (peer) + xdp_features_set_redirect_target(peer, true); } else { - xdp_features_clear_redirect_target(dev); + if (peer) + xdp_features_clear_redirect_target(peer); veth_napi_del(dev); } return 0; @@ -1598,13 +1603,13 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, peer->max_mtu = max_mtu; } - xdp_features_set_redirect_target(dev, true); + xdp_features_set_redirect_target(peer, true); } if (old_prog) { if (!prog) { - if (!veth_gro_requested(dev)) - xdp_features_clear_redirect_target(dev); + if (peer && !veth_gro_requested(dev)) + xdp_features_clear_redirect_target(peer); if (dev->flags & IFF_UP) veth_disable_xdp(dev); From 659c0ce1cb9efc7f58d380ca4bb2a51ae9e30553 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 17 Feb 2023 17:21:54 +0100 Subject: [PATCH 140/175] kernel/sys.c: fix and improve control flow in __sys_setres[ug]id() Linux Security Modules (LSMs) that implement the "capable" hook will usually emit an access denial message to the audit log whenever they "block" the current task from using the given capability based on their security policy. The occurrence of a denial is used as an indication that the given task has attempted an operation that requires the given access permission, so the callers of functions that perform LSM permission checks must take care to avoid calling them too early (before it is decided if the permission is actually needed to perform the requested operation). The __sys_setres[ug]id() functions violate this convention by first calling ns_capable_setid() and only then checking if the operation requires the capability or not. It means that any caller that has the capability granted by DAC (task's capability set) but not by MAC (LSMs) will generate a "denied" audit record, even if is doing an operation for which the capability is not required. Fix this by reordering the checks such that ns_capable_setid() is checked last and -EPERM is returned immediately if it returns false. While there, also do two small optimizations: * move the capability check before prepare_creds() and * bail out early in case of a no-op. Link: https://lkml.kernel.org/r/20230217162154.837549-1-omosnace@redhat.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Ondrej Mosnacek Cc: Eric W. Biederman Cc: Signed-off-by: Andrew Morton --- kernel/sys.c | 69 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 495cd87d9bf4..351de7916302 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -664,6 +664,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) struct cred *new; int retval; kuid_t kruid, keuid, ksuid; + bool ruid_new, euid_new, suid_new; kruid = make_kuid(ns, ruid); keuid = make_kuid(ns, euid); @@ -678,25 +679,29 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) if ((suid != (uid_t) -1) && !uid_valid(ksuid)) return -EINVAL; + old = current_cred(); + + /* check for no-op */ + if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) && + (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) && + uid_eq(keuid, old->fsuid))) && + (suid == (uid_t) -1 || uid_eq(ksuid, old->suid))) + return 0; + + ruid_new = ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && + !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid); + euid_new = euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && + !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid); + suid_new = suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && + !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid); + if ((ruid_new || euid_new || suid_new) && + !ns_capable_setid(old->user_ns, CAP_SETUID)) + return -EPERM; + new = prepare_creds(); if (!new) return -ENOMEM; - old = current_cred(); - - retval = -EPERM; - if (!ns_capable_setid(old->user_ns, CAP_SETUID)) { - if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && - !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) - goto error; - if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && - !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) - goto error; - if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && - !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) - goto error; - } - if (ruid != (uid_t) -1) { new->uid = kruid; if (!uid_eq(kruid, old->uid)) { @@ -761,6 +766,7 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) struct cred *new; int retval; kgid_t krgid, kegid, ksgid; + bool rgid_new, egid_new, sgid_new; krgid = make_kgid(ns, rgid); kegid = make_kgid(ns, egid); @@ -773,23 +779,28 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) return -EINVAL; + old = current_cred(); + + /* check for no-op */ + if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) && + (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) && + gid_eq(kegid, old->fsgid))) && + (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid))) + return 0; + + rgid_new = rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && + !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid); + egid_new = egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && + !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid); + sgid_new = sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && + !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid); + if ((rgid_new || egid_new || sgid_new) && + !ns_capable_setid(old->user_ns, CAP_SETGID)) + return -EPERM; + new = prepare_creds(); if (!new) return -ENOMEM; - old = current_cred(); - - retval = -EPERM; - if (!ns_capable_setid(old->user_ns, CAP_SETGID)) { - if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && - !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) - goto error; - if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && - !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) - goto error; - if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && - !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) - goto error; - } if (rgid != (gid_t) -1) new->gid = krgid; From 1007843a91909a4995ee78a538f62d8665705b66 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 4 Apr 2023 23:31:58 +0900 Subject: [PATCH 141/175] mm/page_alloc: fix potential deadlock on zonelist_update_seq seqlock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzbot is reporting circular locking dependency which involves zonelist_update_seq seqlock [1], for this lock is checked by memory allocation requests which do not need to be retried. One deadlock scenario is kmalloc(GFP_ATOMIC) from an interrupt handler. CPU0 ---- __build_all_zonelists() { write_seqlock(&zonelist_update_seq); // makes zonelist_update_seq.seqcount odd // e.g. timer interrupt handler runs at this moment some_timer_func() { kmalloc(GFP_ATOMIC) { __alloc_pages_slowpath() { read_seqbegin(&zonelist_update_seq) { // spins forever because zonelist_update_seq.seqcount is odd } } } } // e.g. timer interrupt handler finishes write_sequnlock(&zonelist_update_seq); // makes zonelist_update_seq.seqcount even } This deadlock scenario can be easily eliminated by not calling read_seqbegin(&zonelist_update_seq) from !__GFP_DIRECT_RECLAIM allocation requests, for retry is applicable to only __GFP_DIRECT_RECLAIM allocation requests. But Michal Hocko does not know whether we should go with this approach. Another deadlock scenario which syzbot is reporting is a race between kmalloc(GFP_ATOMIC) from tty_insert_flip_string_and_push_buffer() with port->lock held and printk() from __build_all_zonelists() with zonelist_update_seq held. CPU0 CPU1 ---- ---- pty_write() { tty_insert_flip_string_and_push_buffer() { __build_all_zonelists() { write_seqlock(&zonelist_update_seq); build_zonelists() { printk() { vprintk() { vprintk_default() { vprintk_emit() { console_unlock() { console_flush_all() { console_emit_next_record() { con->write() = serial8250_console_write() { spin_lock_irqsave(&port->lock, flags); tty_insert_flip_string() { tty_insert_flip_string_fixed_flag() { __tty_buffer_request_room() { tty_buffer_alloc() { kmalloc(GFP_ATOMIC | __GFP_NOWARN) { __alloc_pages_slowpath() { zonelist_iter_begin() { read_seqbegin(&zonelist_update_seq); // spins forever because zonelist_update_seq.seqcount is odd spin_lock_irqsave(&port->lock, flags); // spins forever because port->lock is held } } } } } } } } spin_unlock_irqrestore(&port->lock, flags); // message is printed to console spin_unlock_irqrestore(&port->lock, flags); } } } } } } } } } write_sequnlock(&zonelist_update_seq); } } } This deadlock scenario can be eliminated by preventing interrupt context from calling kmalloc(GFP_ATOMIC) and preventing printk() from calling console_flush_all() while zonelist_update_seq.seqcount is odd. Since Petr Mladek thinks that __build_all_zonelists() can become a candidate for deferring printk() [2], let's address this problem by disabling local interrupts in order to avoid kmalloc(GFP_ATOMIC) and disabling synchronous printk() in order to avoid console_flush_all() . As a side effect of minimizing duration of zonelist_update_seq.seqcount being odd by disabling synchronous printk(), latency at read_seqbegin(&zonelist_update_seq) for both !__GFP_DIRECT_RECLAIM and __GFP_DIRECT_RECLAIM allocation requests will be reduced. Although, from lockdep perspective, not calling read_seqbegin(&zonelist_update_seq) (i.e. do not record unnecessary locking dependency) from interrupt context is still preferable, even if we don't allow calling kmalloc(GFP_ATOMIC) inside write_seqlock(&zonelist_update_seq)/write_sequnlock(&zonelist_update_seq) section... Link: https://lkml.kernel.org/r/8796b95c-3da3-5885-fddd-6ef55f30e4d3@I-love.SAKURA.ne.jp Fixes: 3d36424b3b58 ("mm/page_alloc: fix race condition between build_all_zonelists and page allocation") Link: https://lkml.kernel.org/r/ZCrs+1cDqPWTDFNM@alley [2] Reported-by: syzbot Link: https://syzkaller.appspot.com/bug?extid=223c7461c58c58a4cb10 [1] Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Acked-by: Mel Gorman Cc: Petr Mladek Cc: David Hildenbrand Cc: Ilpo Järvinen Cc: John Ogness Cc: Patrick Daly Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Signed-off-by: Andrew Morton --- mm/page_alloc.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7136c36c5d01..e8b4f294d763 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6632,7 +6632,21 @@ static void __build_all_zonelists(void *data) int nid; int __maybe_unused cpu; pg_data_t *self = data; + unsigned long flags; + /* + * Explicitly disable this CPU's interrupts before taking seqlock + * to prevent any IRQ handler from calling into the page allocator + * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock. + */ + local_irq_save(flags); + /* + * Explicitly disable this CPU's synchronous printk() before taking + * seqlock to prevent any printk() from trying to hold port->lock, for + * tty_insert_flip_string_and_push_buffer() on other CPU might be + * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. + */ + printk_deferred_enter(); write_seqlock(&zonelist_update_seq); #ifdef CONFIG_NUMA @@ -6671,6 +6685,8 @@ static void __build_all_zonelists(void *data) } write_sequnlock(&zonelist_update_seq); + printk_deferred_exit(); + local_irq_restore(flags); } static noinline void __init From b20b0368c614c609badfe16fbd113dfb4780acd9 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 30 Mar 2023 09:38:22 -0400 Subject: [PATCH 142/175] mm: fix memory leak on mm_init error handling commit f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter") introduces a memory leak by missing a call to destroy_context() when a percpu_counter fails to allocate. Before introducing the per-cpu counter allocations, init_new_context() was the last call that could fail in mm_init(), and thus there was no need to ever invoke destroy_context() in the error paths. Adding the following percpu counter allocations adds error paths after init_new_context(), which means its associated destroy_context() needs to be called when percpu counters fail to allocate. Link: https://lkml.kernel.org/r/20230330133822.66271-1-mathieu.desnoyers@efficios.com Fixes: f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter") Signed-off-by: Mathieu Desnoyers Acked-by: Shakeel Butt Cc: Marek Szyprowski Cc: Signed-off-by: Andrew Morton --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/fork.c b/kernel/fork.c index 0c92f224c68c..ea332319dffe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1174,6 +1174,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, fail_pcpu: while (i > 0) percpu_counter_destroy(&mm->rss_stat[--i]); + destroy_context(mm); fail_nocontext: mm_free_pgd(mm); fail_nopgd: From a101482421a318369eef2d0e03f2fcb40a47abad Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 15 Apr 2023 20:31:10 +0000 Subject: [PATCH 143/175] tools/Makefile: do missed s/vm/mm/ Commit 799fb82aa132 ("tools/vm: rename tools/vm to tools/mm") missed renaming 'vm' in 'tools/Makefile' to 'mm'. As a result, 'make clean' under 'tools/' directory fails as below: $ make -C tools clean DESCEND vm make[1]: Entering directory '/linux/tools/vm' make[1]: *** No rule to make target 'clean'. Stop. make[1]: Leaving directory '/linux/tools/vm' make: *** [Makefile:173: vm_clean] Error 2 make: Leaving directory '/linux/tools' Do the missed rename. Link: https://lkml.kernel.org/r/20230415203110.13858-1-sj@kernel.org Fixes: 799fb82aa132 ("tools/vm: rename tools/vm to tools/mm") Signed-off-by: SeongJae Park Reported-by: Ricardo Pardini Link: https://lore.kernel.org/linux-mm/20230415202454.13558-1-sj@kernel.org/ Tested-by: Ricardo Pardini Signed-off-by: Andrew Morton --- tools/Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index e497875fc7e3..37e9f6804832 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -39,7 +39,7 @@ help: @echo ' turbostat - Intel CPU idle stats and freq reporting tool' @echo ' usb - USB testing tools' @echo ' virtio - vhost test module' - @echo ' vm - misc vm tools' + @echo ' mm - misc mm tools' @echo ' wmi - WMI interface examples' @echo ' x86_energy_perf_policy - Intel energy policy tool' @echo '' @@ -69,7 +69,7 @@ acpi: FORCE cpupower: FORCE $(call descend,power/$@) -cgroup counter firewire hv guest bootconfig spi usb virtio vm bpf iio gpio objtool leds wmi pci firmware debugging tracing: FORCE +cgroup counter firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi pci firmware debugging tracing: FORCE $(call descend,$@) bpf/%: FORCE @@ -118,7 +118,7 @@ kvm_stat: FORCE all: acpi cgroup counter cpupower gpio hv firewire \ perf selftests bootconfig spi turbostat usb \ - virtio vm bpf x86_energy_perf_policy \ + virtio mm bpf x86_energy_perf_policy \ tmon freefall iio objtool kvm_stat wmi \ pci debugging tracing thermal thermometer thermal-engine @@ -128,7 +128,7 @@ acpi_install: cpupower_install: $(call descend,power/$(@:_install=),install) -cgroup_install counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install vm_install bpf_install objtool_install wmi_install pci_install debugging_install tracing_install: +cgroup_install counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install pci_install debugging_install tracing_install: $(call descend,$(@:_install=),install) selftests_install: @@ -158,7 +158,7 @@ kvm_stat_install: install: acpi_install cgroup_install counter_install cpupower_install gpio_install \ hv_install firewire_install iio_install \ perf_install selftests_install turbostat_install usb_install \ - virtio_install vm_install bpf_install x86_energy_perf_policy_install \ + virtio_install mm_install bpf_install x86_energy_perf_policy_install \ tmon_install freefall_install objtool_install kvm_stat_install \ wmi_install pci_install debugging_install intel-speed-select_install \ tracing_install thermometer_install thermal-engine_install @@ -169,7 +169,7 @@ acpi_clean: cpupower_clean: $(call descend,power/cpupower,clean) -cgroup_clean counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean vm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean tracing_clean: +cgroup_clean counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean tracing_clean: $(call descend,$(@:_clean=),clean) libapi_clean: @@ -211,7 +211,7 @@ build_clean: clean: acpi_clean cgroup_clean counter_clean cpupower_clean hv_clean firewire_clean \ perf_clean selftests_clean turbostat_clean bootconfig_clean spi_clean usb_clean virtio_clean \ - vm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ + mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean \ gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean From 47ebd0310e89c087f56e58c103c44b72a2f6b216 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 13 Apr 2023 15:12:20 +0200 Subject: [PATCH 144/175] mm: kmsan: handle alloc failures in kmsan_vmap_pages_range_noflush() As reported by Dipanjan Das, when KMSAN is used together with kernel fault injection (or, generally, even without the latter), calls to kcalloc() or __vmap_pages_range_noflush() may fail, leaving the metadata mappings for the virtual mapping in an inconsistent state. When these metadata mappings are accessed later, the kernel crashes. To address the problem, we return a non-zero error code from kmsan_vmap_pages_range_noflush() in the case of any allocation/mapping failure inside it, and make vmap_pages_range_noflush() return an error if KMSAN fails to allocate the metadata. This patch also removes KMSAN_WARN_ON() from vmap_pages_range_noflush(), as these allocation failures are not fatal anymore. Link: https://lkml.kernel.org/r/20230413131223.4135168-1-glider@google.com Fixes: b073d7f8aee4 ("mm: kmsan: maintain KMSAN metadata for page operations") Signed-off-by: Alexander Potapenko Reported-by: Dipanjan Das Link: https://lore.kernel.org/linux-mm/CANX2M5ZRrRA64k0hOif02TjmY9kbbO2aCBPyq79es34RXZ=cAw@mail.gmail.com/ Reviewed-by: Marco Elver Cc: Christoph Hellwig Cc: Dmitry Vyukov Cc: Uladzislau Rezki (Sony) Cc: Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 20 +++++++++++--------- mm/kmsan/shadow.c | 27 ++++++++++++++++++--------- mm/vmalloc.c | 6 +++++- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index e38ae3c34618..c7ff3aefc5a1 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -134,11 +134,12 @@ void kmsan_kfree_large(const void *ptr); * @page_shift: page_shift passed to vmap_range_noflush(). * * KMSAN maps shadow and origin pages of @pages into contiguous ranges in - * vmalloc metadata address range. + * vmalloc metadata address range. Returns 0 on success, callers must check + * for non-zero return value. */ -void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages, - unsigned int page_shift); +int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift); /** * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. @@ -281,12 +282,13 @@ static inline void kmsan_kfree_large(const void *ptr) { } -static inline void kmsan_vmap_pages_range_noflush(unsigned long start, - unsigned long end, - pgprot_t prot, - struct page **pages, - unsigned int page_shift) +static inline int kmsan_vmap_pages_range_noflush(unsigned long start, + unsigned long end, + pgprot_t prot, + struct page **pages, + unsigned int page_shift) { + return 0; } static inline void kmsan_vunmap_range_noflush(unsigned long start, diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index a787c04e9583..b8bb95eea5e3 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -216,27 +216,29 @@ void kmsan_free_page(struct page *page, unsigned int order) kmsan_leave_runtime(); } -void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages, - unsigned int page_shift) +int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) { unsigned long shadow_start, origin_start, shadow_end, origin_end; struct page **s_pages, **o_pages; - int nr, mapped; + int nr, mapped, err = 0; if (!kmsan_enabled) - return; + return 0; shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW); shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW); if (!shadow_start) - return; + return 0; nr = (end - start) / PAGE_SIZE; s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL); o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL); - if (!s_pages || !o_pages) + if (!s_pages || !o_pages) { + err = -ENOMEM; goto ret; + } for (int i = 0; i < nr; i++) { s_pages[i] = shadow_page_for(pages[i]); o_pages[i] = origin_page_for(pages[i]); @@ -249,10 +251,16 @@ void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, kmsan_enter_runtime(); mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot, s_pages, page_shift); - KMSAN_WARN_ON(mapped); + if (mapped) { + err = mapped; + goto ret; + } mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot, o_pages, page_shift); - KMSAN_WARN_ON(mapped); + if (mapped) { + err = mapped; + goto ret; + } kmsan_leave_runtime(); flush_tlb_kernel_range(shadow_start, shadow_end); flush_tlb_kernel_range(origin_start, origin_end); @@ -262,6 +270,7 @@ void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, ret: kfree(s_pages); kfree(o_pages); + return err; } /* Allocate metadata for pages allocated at boot time. */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a50072066221..1355d95cce1c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -605,7 +605,11 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, int vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { - kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, + page_shift); + + if (ret) + return ret; return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); } From fdea03e12aa2a44a7bb34144208be97fc25dfd90 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 13 Apr 2023 15:12:21 +0200 Subject: [PATCH 145/175] mm: kmsan: handle alloc failures in kmsan_ioremap_page_range() Similarly to kmsan_vmap_pages_range_noflush(), kmsan_ioremap_page_range() must also properly handle allocation/mapping failures. In the case of such, it must clean up the already created metadata mappings and return an error code, so that the error can be propagated to ioremap_page_range(). Without doing so, KMSAN may silently fail to bring the metadata for the page range into a consistent state, which will result in user-visible crashes when trying to access them. Link: https://lkml.kernel.org/r/20230413131223.4135168-2-glider@google.com Fixes: b073d7f8aee4 ("mm: kmsan: maintain KMSAN metadata for page operations") Signed-off-by: Alexander Potapenko Reported-by: Dipanjan Das Link: https://lore.kernel.org/linux-mm/CANX2M5ZRrRA64k0hOif02TjmY9kbbO2aCBPyq79es34RXZ=cAw@mail.gmail.com/ Reviewed-by: Marco Elver Cc: Christoph Hellwig Cc: Dmitry Vyukov Cc: Uladzislau Rezki (Sony) Cc: Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 19 ++++++++------- mm/kmsan/hooks.c | 55 ++++++++++++++++++++++++++++++++++++------- mm/vmalloc.c | 4 ++-- 3 files changed, 59 insertions(+), 19 deletions(-) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index c7ff3aefc5a1..30b17647ce3c 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -160,11 +160,12 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end); * @page_shift: page_shift argument passed to vmap_range_noflush(). * * KMSAN creates new metadata pages for the physical pages mapped into the - * virtual memory. + * virtual memory. Returns 0 on success, callers must check for non-zero return + * value. */ -void kmsan_ioremap_page_range(unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int page_shift); +int kmsan_ioremap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift); /** * kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call. @@ -296,12 +297,12 @@ static inline void kmsan_vunmap_range_noflush(unsigned long start, { } -static inline void kmsan_ioremap_page_range(unsigned long start, - unsigned long end, - phys_addr_t phys_addr, - pgprot_t prot, - unsigned int page_shift) +static inline int kmsan_ioremap_page_range(unsigned long start, + unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift) { + return 0; } static inline void kmsan_iounmap_page_range(unsigned long start, diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 3807502766a3..ec0da72e65aa 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -148,35 +148,74 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end) * into the virtual memory. If those physical pages already had shadow/origin, * those are ignored. */ -void kmsan_ioremap_page_range(unsigned long start, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int page_shift) +int kmsan_ioremap_page_range(unsigned long start, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift) { gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO; struct page *shadow, *origin; unsigned long off = 0; - int nr; + int nr, err = 0, clean = 0, mapped; if (!kmsan_enabled || kmsan_in_runtime()) - return; + return 0; nr = (end - start) / PAGE_SIZE; kmsan_enter_runtime(); - for (int i = 0; i < nr; i++, off += PAGE_SIZE) { + for (int i = 0; i < nr; i++, off += PAGE_SIZE, clean = i) { shadow = alloc_pages(gfp_mask, 1); origin = alloc_pages(gfp_mask, 1); - __vmap_pages_range_noflush( + if (!shadow || !origin) { + err = -ENOMEM; + goto ret; + } + mapped = __vmap_pages_range_noflush( vmalloc_shadow(start + off), vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow, PAGE_SHIFT); - __vmap_pages_range_noflush( + if (mapped) { + err = mapped; + goto ret; + } + shadow = NULL; + mapped = __vmap_pages_range_noflush( vmalloc_origin(start + off), vmalloc_origin(start + off + PAGE_SIZE), prot, &origin, PAGE_SHIFT); + if (mapped) { + __vunmap_range_noflush( + vmalloc_shadow(start + off), + vmalloc_shadow(start + off + PAGE_SIZE)); + err = mapped; + goto ret; + } + origin = NULL; + } + /* Page mapping loop finished normally, nothing to clean up. */ + clean = 0; + +ret: + if (clean > 0) { + /* + * Something went wrong. Clean up shadow/origin pages allocated + * on the last loop iteration, then delete mappings created + * during the previous iterations. + */ + if (shadow) + __free_pages(shadow, 1); + if (origin) + __free_pages(origin, 1); + __vunmap_range_noflush( + vmalloc_shadow(start), + vmalloc_shadow(start + clean * PAGE_SIZE)); + __vunmap_range_noflush( + vmalloc_origin(start), + vmalloc_origin(start + clean * PAGE_SIZE)); } flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); kmsan_leave_runtime(); + return err; } void kmsan_iounmap_page_range(unsigned long start, unsigned long end) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1355d95cce1c..31ff782d368b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -313,8 +313,8 @@ int ioremap_page_range(unsigned long addr, unsigned long end, ioremap_max_page_shift); flush_cache_vmap(addr, end); if (!err) - kmsan_ioremap_page_range(addr, end, phys_addr, prot, - ioremap_max_page_shift); + err = kmsan_ioremap_page_range(addr, end, phys_addr, prot, + ioremap_max_page_shift); return err; } From fad8e4291da5e3243e086622df63cb952db444d8 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 14 Apr 2023 10:57:26 -0400 Subject: [PATCH 146/175] maple_tree: make maple state reusable after mas_empty_area_rev() Stop using maple state min/max for the range by passing through pointers for those values. This will allow the maple state to be reused without resetting. Also add some logic to fail out early on searching with invalid arguments. Link: https://lkml.kernel.org/r/20230414145728.4067069-1-Liam.Howlett@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Reported-by: Rick Edgecombe Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7ff2a821a2a1..d197b49eee67 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4965,7 +4965,8 @@ not_found: * Return: True if found in a leaf, false otherwise. * */ -static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) +static bool mas_rev_awalk(struct ma_state *mas, unsigned long size, + unsigned long *gap_min, unsigned long *gap_max) { enum maple_type type = mte_node_type(mas->node); struct maple_node *node = mas_mn(mas); @@ -5030,8 +5031,8 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) if (unlikely(ma_is_leaf(type))) { mas->offset = offset; - mas->min = min; - mas->max = min + gap - 1; + *gap_min = min; + *gap_max = min + gap - 1; return true; } @@ -5307,6 +5308,9 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long *pivots; enum maple_type mt; + if (min >= max) + return -EINVAL; + if (mas_is_start(mas)) mas_start(mas); else if (mas->offset >= 2) @@ -5361,6 +5365,9 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, { struct maple_enode *last = mas->node; + if (min >= max) + return -EINVAL; + if (mas_is_start(mas)) { mas_start(mas); mas->offset = mas_data_end(mas); @@ -5380,7 +5387,7 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, mas->index = min; mas->last = max; - while (!mas_rev_awalk(mas, size)) { + while (!mas_rev_awalk(mas, size, &min, &max)) { if (last == mas->node) { if (!mas_rewind_node(mas)) return -EBUSY; @@ -5395,17 +5402,9 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, if (unlikely(mas->offset == MAPLE_NODE_SLOTS)) return -EBUSY; - /* - * mas_rev_awalk() has set mas->min and mas->max to the gap values. If - * the maximum is outside the window we are searching, then use the last - * location in the search. - * mas->max and mas->min is the range of the gap. - * mas->index and mas->last are currently set to the search range. - */ - /* Trim the upper limit to the max. */ - if (mas->max <= mas->last) - mas->last = mas->max; + if (max <= mas->last) + mas->last = max; mas->index = mas->last - size + 1; return 0; From 06e8fd999334bcd76b4d72d7b9206d4aea89764e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 14 Apr 2023 10:57:27 -0400 Subject: [PATCH 147/175] maple_tree: fix mas_empty_area() search The internal function of mas_awalk() was incorrectly skipping the last entry in a node, which could potentially be NULL. This is only a problem for the left-most node in the tree - otherwise that NULL would not exist. Fix mas_awalk() by using the metadata to obtain the end of the node for the loop and the logical pivot as apposed to the raw pivot value. Link: https://lkml.kernel.org/r/20230414145728.4067069-2-Liam.Howlett@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Reported-by: Rick Edgecombe Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d197b49eee67..1281a40d5735 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5056,10 +5056,10 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) { enum maple_type type = mte_node_type(mas->node); unsigned long pivot, min, gap = 0; - unsigned char offset; - unsigned long *gaps; - unsigned long *pivots = ma_pivots(mas_mn(mas), type); - void __rcu **slots = ma_slots(mas_mn(mas), type); + unsigned char offset, data_end; + unsigned long *gaps, *pivots; + void __rcu **slots; + struct maple_node *node; bool found = false; if (ma_is_dense(type)) { @@ -5067,13 +5067,15 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) return true; } - gaps = ma_gaps(mte_to_node(mas->node), type); + node = mas_mn(mas); + pivots = ma_pivots(node, type); + slots = ma_slots(node, type); + gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); - for (; offset < mt_slots[type]; offset++) { - pivot = mas_safe_pivot(mas, pivots, offset, type); - if (offset && !pivot) - break; + data_end = ma_data_end(node, type, pivots, mas->max); + for (; offset <= data_end; offset++) { + pivot = mas_logical_pivot(mas, pivots, offset, type); /* Not within lower bounds */ if (mas->index > pivot) From 58c5d0d6d522112577c7eeb71d382ea642ed7be4 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 14 Apr 2023 14:59:19 -0400 Subject: [PATCH 148/175] mm/mmap: regression fix for unmapped_area{_topdown} The maple tree limits the gap returned to a window that specifically fits what was asked. This may not be optimal in the case of switching search directions or a gap that does not satisfy the requested space for other reasons. Fix the search by retrying the operation and limiting the search window in the rare occasion that a conflict occurs. Link: https://lkml.kernel.org/r/20230414185919.4175572-1-Liam.Howlett@oracle.com Fixes: 3499a13168da ("mm/mmap: use maple tree for unmapped_area{_topdown}") Signed-off-by: Liam R. Howlett Reported-by: Rick Edgecombe Cc: Signed-off-by: Andrew Morton --- mm/mmap.c | 48 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index ff68a67a2a7c..d5475fbf5729 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1518,7 +1518,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) */ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) { - unsigned long length, gap; + unsigned long length, gap, low_limit; + struct vm_area_struct *tmp; MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); @@ -1527,12 +1528,29 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) if (length < info->length) return -ENOMEM; - if (mas_empty_area(&mas, info->low_limit, info->high_limit - 1, - length)) + low_limit = info->low_limit; +retry: + if (mas_empty_area(&mas, low_limit, info->high_limit - 1, length)) return -ENOMEM; gap = mas.index; gap += (info->align_offset - gap) & info->align_mask; + tmp = mas_next(&mas, ULONG_MAX); + if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) < gap + length - 1) { + low_limit = tmp->vm_end; + mas_reset(&mas); + goto retry; + } + } else { + tmp = mas_prev(&mas, 0); + if (tmp && vm_end_gap(tmp) > gap) { + low_limit = vm_end_gap(tmp); + mas_reset(&mas); + goto retry; + } + } + return gap; } @@ -1548,7 +1566,8 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) */ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { - unsigned long length, gap; + unsigned long length, gap, high_limit, gap_end; + struct vm_area_struct *tmp; MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ @@ -1556,12 +1575,31 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) if (length < info->length) return -ENOMEM; - if (mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1, + high_limit = info->high_limit; +retry: + if (mas_empty_area_rev(&mas, info->low_limit, high_limit - 1, length)) return -ENOMEM; gap = mas.last + 1 - info->length; gap -= (gap - info->align_offset) & info->align_mask; + gap_end = mas.last; + tmp = mas_next(&mas, ULONG_MAX); + if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) <= gap_end) { + high_limit = vm_start_gap(tmp); + mas_reset(&mas); + goto retry; + } + } else { + tmp = mas_prev(&mas, 0); + if (tmp && vm_end_gap(tmp) > gap) { + high_limit = tmp->vm_start; + mas_reset(&mas); + goto retry; + } + } + return gap; } From 4d73ba5fa710fe7d432e0b271e6fecd252aef66e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 14 Apr 2023 15:14:29 +0100 Subject: [PATCH 149/175] mm: page_alloc: skip regions with hugetlbfs pages when allocating 1G pages A bug was reported by Yuanxi Liu where allocating 1G pages at runtime is taking an excessive amount of time for large amounts of memory. Further testing allocating huge pages that the cost is linear i.e. if allocating 1G pages in batches of 10 then the time to allocate nr_hugepages from 10->20->30->etc increases linearly even though 10 pages are allocated at each step. Profiles indicated that much of the time is spent checking the validity within already existing huge pages and then attempting a migration that fails after isolating the range, draining pages and a whole lot of other useless work. Commit eb14d4eefdc4 ("mm,page_alloc: drop unnecessary checks from pfn_range_valid_contig") removed two checks, one which ignored huge pages for contiguous allocations as huge pages can sometimes migrate. While there may be value on migrating a 2M page to satisfy a 1G allocation, it's potentially expensive if the 1G allocation fails and it's pointless to try moving a 1G page for a new 1G allocation or scan the tail pages for valid PFNs. Reintroduce the PageHuge check and assume any contiguous region with hugetlbfs pages is unsuitable for a new 1G allocation. The hpagealloc test allocates huge pages in batches and reports the average latency per page over time. This test happens just after boot when fragmentation is not an issue. Units are in milliseconds. hpagealloc 6.3.0-rc6 6.3.0-rc6 6.3.0-rc6 vanilla hugeallocrevert-v1r1 hugeallocsimple-v1r2 Min Latency 26.42 ( 0.00%) 5.07 ( 80.82%) 18.94 ( 28.30%) 1st-qrtle Latency 356.61 ( 0.00%) 5.34 ( 98.50%) 19.85 ( 94.43%) 2nd-qrtle Latency 697.26 ( 0.00%) 5.47 ( 99.22%) 20.44 ( 97.07%) 3rd-qrtle Latency 972.94 ( 0.00%) 5.50 ( 99.43%) 20.81 ( 97.86%) Max-1 Latency 26.42 ( 0.00%) 5.07 ( 80.82%) 18.94 ( 28.30%) Max-5 Latency 82.14 ( 0.00%) 5.11 ( 93.78%) 19.31 ( 76.49%) Max-10 Latency 150.54 ( 0.00%) 5.20 ( 96.55%) 19.43 ( 87.09%) Max-90 Latency 1164.45 ( 0.00%) 5.53 ( 99.52%) 20.97 ( 98.20%) Max-95 Latency 1223.06 ( 0.00%) 5.55 ( 99.55%) 21.06 ( 98.28%) Max-99 Latency 1278.67 ( 0.00%) 5.57 ( 99.56%) 22.56 ( 98.24%) Max Latency 1310.90 ( 0.00%) 8.06 ( 99.39%) 26.62 ( 97.97%) Amean Latency 678.36 ( 0.00%) 5.44 * 99.20%* 20.44 * 96.99%* 6.3.0-rc6 6.3.0-rc6 6.3.0-rc6 vanilla revert-v1 hugeallocfix-v2 Duration User 0.28 0.27 0.30 Duration System 808.66 17.77 35.99 Duration Elapsed 830.87 18.08 36.33 The vanilla kernel is poor, taking up to 1.3 second to allocate a huge page and almost 10 minutes in total to run the test. Reverting the problematic commit reduces it to 8ms at worst and the patch takes 26ms. This patch fixes the main issue with skipping huge pages but leaves the page_count() out because a page with an elevated count potentially can migrate. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=217022 Link: https://lkml.kernel.org/r/20230414141429.pwgieuwluxwez3rj@techsingularity.net Fixes: eb14d4eefdc4 ("mm,page_alloc: drop unnecessary checks from pfn_range_valid_contig") Signed-off-by: Mel Gorman Reported-by: Yuanxi Liu Acked-by: Vlastimil Babka Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e8b4f294d763..8e39705c7bdc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -9466,6 +9466,9 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, if (PageReserved(page)) return false; + + if (PageHuge(page)) + return false; } return true; } From ef832747a82dfbc22a3702219cc716f449b24e4a Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Tue, 18 Apr 2023 02:35:13 +0900 Subject: [PATCH 150/175] nilfs2: initialize unused bytes in segment summary blocks Syzbot still reports uninit-value in nilfs_add_checksums_on_logs() for KMSAN enabled kernels after applying commit 7397031622e0 ("nilfs2: initialize "struct nilfs_binfo_dat"->bi_pad field"). This is because the unused bytes at the end of each block in segment summaries are not initialized. So this fixes the issue by padding the unused bytes with null bytes. Link: https://lkml.kernel.org/r/20230417173513.12598-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Reported-by: syzbot+048585f3f4227bb2b49b@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=048585f3f4227bb2b49b Cc: Alexander Potapenko Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 6ad41390fa74..228659612c0d 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -430,6 +430,23 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) return 0; } +/** + * nilfs_segctor_zeropad_segsum - zero pad the rest of the segment summary area + * @sci: segment constructor object + * + * nilfs_segctor_zeropad_segsum() zero-fills unallocated space at the end of + * the current segment summary block. + */ +static void nilfs_segctor_zeropad_segsum(struct nilfs_sc_info *sci) +{ + struct nilfs_segsum_pointer *ssp; + + ssp = sci->sc_blk_cnt > 0 ? &sci->sc_binfo_ptr : &sci->sc_finfo_ptr; + if (ssp->offset < ssp->bh->b_size) + memset(ssp->bh->b_data + ssp->offset, 0, + ssp->bh->b_size - ssp->offset); +} + static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) { sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; @@ -438,6 +455,7 @@ static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) * The current segment is filled up * (internal code) */ + nilfs_segctor_zeropad_segsum(sci); sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg); return nilfs_segctor_reset_segment_buffer(sci); } @@ -542,6 +560,7 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, goto retry; } if (unlikely(required)) { + nilfs_segctor_zeropad_segsum(sci); err = nilfs_segbuf_extend_segsum(segbuf); if (unlikely(err)) goto failed; @@ -1533,6 +1552,7 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); sci->sc_stage = prev_stage; } + nilfs_segctor_zeropad_segsum(sci); nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile); return 0; From ff9f3d7aefddbaa9a9b0f18f83e4319b5cd0e63e Mon Sep 17 00:00:00 2001 From: Qing Zhang Date: Wed, 19 Apr 2023 12:07:27 +0800 Subject: [PATCH 151/175] LoongArch: Adjust user_watch_state for explicit alignment This is done in order to easily calculate the number of breakpoints in hw_break_get()/hw_break_set(). Signed-off-by: Qing Zhang Signed-off-by: Huacai Chen --- arch/loongarch/include/uapi/asm/ptrace.h | 3 ++- arch/loongarch/kernel/ptrace.c | 15 +++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/loongarch/include/uapi/asm/ptrace.h b/arch/loongarch/include/uapi/asm/ptrace.h index cc48ed262021..82d811b5c6e9 100644 --- a/arch/loongarch/include/uapi/asm/ptrace.h +++ b/arch/loongarch/include/uapi/asm/ptrace.h @@ -47,11 +47,12 @@ struct user_fp_state { }; struct user_watch_state { - uint16_t dbg_info; + uint64_t dbg_info; struct { uint64_t addr; uint64_t mask; uint32_t ctrl; + uint32_t pad; } dbg_regs[8]; }; diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c index 06bceae7d104..bb2e1af135ea 100644 --- a/arch/loongarch/kernel/ptrace.c +++ b/arch/loongarch/kernel/ptrace.c @@ -391,10 +391,10 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, return 0; } -static int ptrace_hbp_get_resource_info(unsigned int note_type, u16 *info) +static int ptrace_hbp_get_resource_info(unsigned int note_type, u64 *info) { u8 num; - u16 reg = 0; + u64 reg = 0; switch (note_type) { case NT_LOONGARCH_HW_BREAK: @@ -524,15 +524,16 @@ static int ptrace_hbp_set_addr(unsigned int note_type, return modify_user_hw_breakpoint(bp, &attr); } -#define PTRACE_HBP_CTRL_SZ sizeof(u32) #define PTRACE_HBP_ADDR_SZ sizeof(u64) #define PTRACE_HBP_MASK_SZ sizeof(u64) +#define PTRACE_HBP_CTRL_SZ sizeof(u32) +#define PTRACE_HBP_PAD_SZ sizeof(u32) static int hw_break_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - u16 info; + u64 info; u32 ctrl; u64 addr, mask; int ret, idx = 0; @@ -562,6 +563,7 @@ static int hw_break_get(struct task_struct *target, membuf_store(&to, addr); membuf_store(&to, mask); membuf_store(&to, ctrl); + membuf_zero(&to, sizeof(u32)); idx++; } @@ -620,6 +622,11 @@ static int hw_break_set(struct task_struct *target, if (ret) return ret; offset += PTRACE_HBP_CTRL_SZ; + + user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, + offset, offset + PTRACE_HBP_PAD_SZ); + offset += PTRACE_HBP_PAD_SZ; + idx++; } From e32b3b8222204df8a2642a770f79ec2d7086faed Mon Sep 17 00:00:00 2001 From: Qing Zhang Date: Wed, 19 Apr 2023 12:07:27 +0800 Subject: [PATCH 152/175] LoongArch: Adjust user_regset_copyin parameter to the correct offset Ensure that user_watch_state can be set correctly by the user. Signed-off-by: Qing Zhang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/ptrace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c index bb2e1af135ea..5fcffb452367 100644 --- a/arch/loongarch/kernel/ptrace.c +++ b/arch/loongarch/kernel/ptrace.c @@ -546,7 +546,7 @@ static int hw_break_get(struct task_struct *target, membuf_write(&to, &info, sizeof(info)); - /* (address, ctrl) registers */ + /* (address, mask, ctrl) registers */ while (to.left) { ret = ptrace_hbp_get_addr(note_type, target, idx, &addr); if (ret) @@ -584,7 +584,7 @@ static int hw_break_set(struct task_struct *target, offset = offsetof(struct user_watch_state, dbg_regs); user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 0, offset); - /* (address, ctrl) registers */ + /* (address, mask, ctrl) registers */ limit = regset->n * regset->size; while (count && offset < limit) { if (count < PTRACE_HBP_ADDR_SZ) @@ -604,7 +604,7 @@ static int hw_break_set(struct task_struct *target, break; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &mask, - offset, offset + PTRACE_HBP_ADDR_SZ); + offset, offset + PTRACE_HBP_MASK_SZ); if (ret) return ret; @@ -613,8 +613,8 @@ static int hw_break_set(struct task_struct *target, return ret; offset += PTRACE_HBP_MASK_SZ; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &mask, - offset, offset + PTRACE_HBP_MASK_SZ); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, + offset, offset + PTRACE_HBP_CTRL_SZ); if (ret) return ret; From 370a3b8f58743eceb97c5256538d6048c26d2d03 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 19 Apr 2023 12:07:27 +0800 Subject: [PATCH 153/175] LoongArch: Check unwind_error() in arch_stack_walk() We can see the following messages with CONFIG_PROVE_LOCKING=y on LoongArch: BUG: MAX_STACK_TRACE_ENTRIES too low! turning off the locking correctness validator. This is because stack_trace_save() returns a big value after call arch_stack_walk(), here is the call trace: save_trace() stack_trace_save() arch_stack_walk() stack_trace_consume_entry() arch_stack_walk() should return immediately if unwind_next_frame() failed, no need to do the useless loops to increase the value of c->len in stack_trace_consume_entry(), then we can fix the above problem. Cc: stable@vger.kernel.org Reported-by: Guenter Roeck Link: https://lore.kernel.org/all/8a44ad71-68d2-4926-892f-72bfc7a67e2a@roeck-us.net/ Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/stacktrace.c | 2 +- arch/loongarch/kernel/unwind.c | 1 + arch/loongarch/kernel/unwind_prologue.c | 4 +++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/kernel/stacktrace.c b/arch/loongarch/kernel/stacktrace.c index 3a690f96f00c..2463d2fea21f 100644 --- a/arch/loongarch/kernel/stacktrace.c +++ b/arch/loongarch/kernel/stacktrace.c @@ -30,7 +30,7 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, regs->regs[1] = 0; for (unwind_start(&state, task, regs); - !unwind_done(&state); unwind_next_frame(&state)) { + !unwind_done(&state) && !unwind_error(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); if (!addr || !consume_entry(cookie, addr)) break; diff --git a/arch/loongarch/kernel/unwind.c b/arch/loongarch/kernel/unwind.c index a463d6961344..ba324ba76fa1 100644 --- a/arch/loongarch/kernel/unwind.c +++ b/arch/loongarch/kernel/unwind.c @@ -28,5 +28,6 @@ bool default_next_frame(struct unwind_state *state) } while (!get_stack_info(state->sp, state->task, info)); + state->error = true; return false; } diff --git a/arch/loongarch/kernel/unwind_prologue.c b/arch/loongarch/kernel/unwind_prologue.c index 9095fde8e55d..55afc27320e1 100644 --- a/arch/loongarch/kernel/unwind_prologue.c +++ b/arch/loongarch/kernel/unwind_prologue.c @@ -211,7 +211,7 @@ static bool next_frame(struct unwind_state *state) pc = regs->csr_era; if (user_mode(regs) || !__kernel_text_address(pc)) - return false; + goto out; state->first = true; state->pc = pc; @@ -226,6 +226,8 @@ static bool next_frame(struct unwind_state *state) } while (!get_stack_info(state->sp, state->task, info)); +out: + state->error = true; return false; } From afca6e06494c75e25a71ccb4926459944e23098b Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 19 Apr 2023 12:07:27 +0800 Subject: [PATCH 154/175] LoongArch: Clean up plat_swiotlb_setup() related code After commit c78c43fe7d42 ("LoongArch: Use acpi_arch_dma_setup() and remove ARCH_HAS_PHYS_TO_DMA"), plat_swiotlb_setup() has been deleted, so clean up the related code. Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/bootinfo.h | 1 - arch/loongarch/kernel/setup.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/include/asm/bootinfo.h b/arch/loongarch/include/asm/bootinfo.h index 0051b526ac6d..c60796869b2b 100644 --- a/arch/loongarch/include/asm/bootinfo.h +++ b/arch/loongarch/include/asm/bootinfo.h @@ -13,7 +13,6 @@ const char *get_system_type(void); extern void init_environ(void); extern void memblock_init(void); extern void platform_init(void); -extern void plat_swiotlb_setup(void); extern int __init init_numa_memory(void); struct loongson_board_info { diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 27f71f9531e1..4444b13418f0 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -389,8 +389,8 @@ static void __init arch_mem_init(char **cmdline_p) /* * In order to reduce the possibility of kernel panic when failed to * get IO TLB memory under CONFIG_SWIOTLB, it is better to allocate - * low memory as small as possible before plat_swiotlb_setup(), so - * make sparse_init() using top-down allocation. + * low memory as small as possible before swiotlb_init(), so make + * sparse_init() using top-down allocation. */ memblock_set_bottom_up(false); sparse_init(); From 213ef669d1e536e57cdff8ddc2d3b9347b98e35f Mon Sep 17 00:00:00 2001 From: Enze Li Date: Wed, 19 Apr 2023 12:07:27 +0800 Subject: [PATCH 155/175] LoongArch: Replace hard-coded values in comments with VALEN According to LoongArch documentation [1], CSR.PGDL and CSR.PGDH are concerned with the VA's MSB which is VALEN-1 instead of always being 47. Fix comments to avoid misleading others. [1] https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#page-global-directory-base-address-for-lower-half-address-space Reviewed-by: WANG Xuerui Signed-off-by: Enze Li Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/loongarch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 8c2969965c3c..83da5d29e2d1 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -423,9 +423,9 @@ static __always_inline void iocsr_write64(u64 val, u32 reg) #define CSR_ASID_ASID_WIDTH 10 #define CSR_ASID_ASID (_ULCAST_(0x3ff) << CSR_ASID_ASID_SHIFT) -#define LOONGARCH_CSR_PGDL 0x19 /* Page table base address when VA[47] = 0 */ +#define LOONGARCH_CSR_PGDL 0x19 /* Page table base address when VA[VALEN-1] = 0 */ -#define LOONGARCH_CSR_PGDH 0x1a /* Page table base address when VA[47] = 1 */ +#define LOONGARCH_CSR_PGDH 0x1a /* Page table base address when VA[VALEN-1] = 1 */ #define LOONGARCH_CSR_PGD 0x1b /* Page table base */ From b5533e990dd1de5872a34cba2f4f7f508c9b2ec3 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 19 Apr 2023 12:07:34 +0800 Subject: [PATCH 156/175] tools/loongarch: Use __SIZEOF_LONG__ to define __BITS_PER_LONG Although __SIZEOF_POINTER__ is equal to _SIZEOF_LONG__ on LoongArch, it is better to use __SIZEOF_LONG__ to define __BITS_PER_LONG to keep consistent between arch/loongarch/include/uapi/asm/bitsperlong.h and tools/arch/loongarch/include/uapi/asm/bitsperlong.h. Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- tools/arch/loongarch/include/uapi/asm/bitsperlong.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/arch/loongarch/include/uapi/asm/bitsperlong.h b/tools/arch/loongarch/include/uapi/asm/bitsperlong.h index d4e32b3d4843..00b4ba1e5cdf 100644 --- a/tools/arch/loongarch/include/uapi/asm/bitsperlong.h +++ b/tools/arch/loongarch/include/uapi/asm/bitsperlong.h @@ -2,7 +2,7 @@ #ifndef __ASM_LOONGARCH_BITSPERLONG_H #define __ASM_LOONGARCH_BITSPERLONG_H -#define __BITS_PER_LONG (__SIZEOF_POINTER__ * 8) +#define __BITS_PER_LONG (__SIZEOF_LONG__ * 8) #include From c484fcc058bada604d7e4e5228d4affb646ddbc2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 17 Apr 2023 09:12:16 +0300 Subject: [PATCH 157/175] bonding: Fix memory leak when changing bond type to Ethernet When a net device is put administratively up, its 'IFF_UP' flag is set (if not set already) and a 'NETDEV_UP' notification is emitted, which causes the 8021q driver to add VLAN ID 0 on the device. The reverse happens when a net device is put administratively down. When changing the type of a bond to Ethernet, its 'IFF_UP' flag is incorrectly cleared, resulting in the kernel skipping the above process and VLAN ID 0 being leaked [1]. Fix by restoring the flag when changing the type to Ethernet, in a similar fashion to the restoration of the 'IFF_SLAVE' flag. The issue can be reproduced using the script in [2], with example out before and after the fix in [3]. [1] unreferenced object 0xffff888103479900 (size 256): comm "ip", pid 329, jiffies 4294775225 (age 28.561s) hex dump (first 32 bytes): 00 a0 0c 15 81 88 ff ff 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmalloc_trace+0x2a/0xe0 [] vlan_vid_add+0x30c/0x790 [] vlan_device_event+0x1491/0x21a0 [] notifier_call_chain+0xbe/0x1f0 [] call_netdevice_notifiers_info+0xba/0x150 [] __dev_notify_flags+0x132/0x2e0 [] dev_change_flags+0x11f/0x180 [] do_setlink+0xb96/0x4060 [] __rtnl_newlink+0xc0a/0x18a0 [] rtnl_newlink+0x6c/0xa0 [] rtnetlink_rcv_msg+0x43e/0xe00 [] netlink_rcv_skb+0x170/0x440 [] netlink_unicast+0x53f/0x810 [] netlink_sendmsg+0x96b/0xe90 [] ____sys_sendmsg+0x30f/0xa70 [] ___sys_sendmsg+0x13a/0x1e0 unreferenced object 0xffff88810f6a83e0 (size 32): comm "ip", pid 329, jiffies 4294775225 (age 28.561s) hex dump (first 32 bytes): a0 99 47 03 81 88 ff ff a0 99 47 03 81 88 ff ff ..G.......G..... 81 00 00 00 01 00 00 00 cc cc cc cc cc cc cc cc ................ backtrace: [] kmalloc_trace+0x2a/0xe0 [] vlan_vid_add+0x409/0x790 [] vlan_device_event+0x1491/0x21a0 [] notifier_call_chain+0xbe/0x1f0 [] call_netdevice_notifiers_info+0xba/0x150 [] __dev_notify_flags+0x132/0x2e0 [] dev_change_flags+0x11f/0x180 [] do_setlink+0xb96/0x4060 [] __rtnl_newlink+0xc0a/0x18a0 [] rtnl_newlink+0x6c/0xa0 [] rtnetlink_rcv_msg+0x43e/0xe00 [] netlink_rcv_skb+0x170/0x440 [] netlink_unicast+0x53f/0x810 [] netlink_sendmsg+0x96b/0xe90 [] ____sys_sendmsg+0x30f/0xa70 [] ___sys_sendmsg+0x13a/0x1e0 [2] ip link add name t-nlmon type nlmon ip link add name t-dummy type dummy ip link add name t-bond type bond mode active-backup ip link set dev t-bond up ip link set dev t-nlmon master t-bond ip link set dev t-nlmon nomaster ip link show dev t-bond ip link set dev t-dummy master t-bond ip link show dev t-bond ip link del dev t-bond ip link del dev t-dummy ip link del dev t-nlmon [3] Before: 12: t-bond: mtu 1500 qdisc noqueue state DOWN mode DEFAULT group default qlen 1000 link/netlink 12: t-bond: mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000 link/ether 46:57:39:a4:46:a2 brd ff:ff:ff:ff:ff:ff After: 12: t-bond: mtu 1500 qdisc noqueue state DOWN mode DEFAULT group default qlen 1000 link/netlink 12: t-bond: mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000 link/ether 66:48:7b:74:b6:8a brd ff:ff:ff:ff:ff:ff Fixes: e36b9d16c6a6 ("bonding: clean muticast addresses when device changes type") Fixes: 75c78500ddad ("bonding: remap muticast addresses without using dev_close() and dev_open()") Fixes: 9ec7eb60dcbc ("bonding: restore IFF_MASTER/SLAVE flags on bond enslave ether type change") Reported-by: Mirsad Goran Todorovac Link: https://lore.kernel.org/netdev/78a8a03b-6070-3e6b-5042-f848dab16fb8@alu.unizg.hr/ Tested-by: Mirsad Goran Todorovac Signed-off-by: Ido Schimmel Acked-by: Jay Vosburgh Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 8cc9a74789b7..7a7d584f378a 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1777,14 +1777,15 @@ void bond_lower_state_changed(struct slave *slave) /* The bonding driver uses ether_setup() to convert a master bond device * to ARPHRD_ETHER, that resets the target netdevice's flags so we always - * have to restore the IFF_MASTER flag, and only restore IFF_SLAVE if it was set + * have to restore the IFF_MASTER flag, and only restore IFF_SLAVE and IFF_UP + * if they were set */ static void bond_ether_setup(struct net_device *bond_dev) { - unsigned int slave_flag = bond_dev->flags & IFF_SLAVE; + unsigned int flags = bond_dev->flags & (IFF_SLAVE | IFF_UP); ether_setup(bond_dev); - bond_dev->flags |= IFF_MASTER | slave_flag; + bond_dev->flags |= IFF_MASTER | flags; bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; } From 6f4833383e8514ea796d094e05c24889b8997fde Mon Sep 17 00:00:00 2001 From: Seiji Nishikawa Date: Mon, 17 Apr 2023 21:21:27 +0900 Subject: [PATCH 158/175] net: vmxnet3: Fix NULL pointer dereference in vmxnet3_rq_rx_complete() When vmxnet3_rq_create() fails to allocate rq->data_ring.base due to page allocation failure, subsequent call to vmxnet3_rq_rx_complete() can result in NULL pointer dereference. To fix this bug, check not only that rxDataRingUsed is true but also that adapter->rxdataring_enabled is true before calling memcpy() in vmxnet3_rq_rx_complete(). [1728352.477993] ethtool: page allocation failure: order:9, mode:0x6000c0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0 ... [1728352.478009] Call Trace: [1728352.478028] dump_stack+0x41/0x60 [1728352.478035] warn_alloc.cold.120+0x7b/0x11b [1728352.478038] ? _cond_resched+0x15/0x30 [1728352.478042] ? __alloc_pages_direct_compact+0x15f/0x170 [1728352.478043] __alloc_pages_slowpath+0xcd3/0xd10 [1728352.478047] __alloc_pages_nodemask+0x2e2/0x320 [1728352.478049] __dma_direct_alloc_pages.constprop.25+0x8a/0x120 [1728352.478053] dma_direct_alloc+0x5a/0x2a0 [1728352.478056] vmxnet3_rq_create.part.57+0x17c/0x1f0 [vmxnet3] ... [1728352.478188] vmxnet3 0000:0b:00.0 ens192: rx data ring will be disabled ... [1728352.515347] BUG: unable to handle kernel NULL pointer dereference at 0000000000000034 ... [1728352.515440] RIP: 0010:memcpy_orig+0x54/0x130 ... [1728352.515655] Call Trace: [1728352.515665] [1728352.515672] vmxnet3_rq_rx_complete+0x419/0xef0 [vmxnet3] [1728352.515690] vmxnet3_poll_rx_only+0x31/0xa0 [vmxnet3] ... Signed-off-by: Seiji Nishikawa Signed-off-by: David S. Miller --- drivers/net/vmxnet3/vmxnet3_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index da488cbb0542..f2b76ee866a4 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -1504,7 +1504,7 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq, goto rcd_done; } - if (rxDataRingUsed) { + if (rxDataRingUsed && adapter->rxdataring_enabled) { size_t sz; BUG_ON(rcd->len > rq->data_ring.desc_size); From 4e006c7a6dac0ead4c1bf606000aa90a372fc253 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 17 Apr 2023 09:00:52 -0400 Subject: [PATCH 159/175] net: rpl: fix rpl header size calculation This patch fixes a missing 8 byte for the header size calculation. The ipv6_rpl_srh_size() is used to check a skb_pull() on skb->data which points to skb_transport_header(). Currently we only check on the calculated addresses fields using CmprI and CmprE fields, see: https://www.rfc-editor.org/rfc/rfc6554#section-3 there is however a missing 8 byte inside the calculation which stands for the fields before the addresses field. Those 8 bytes are represented by sizeof(struct ipv6_rpl_sr_hdr) expression. Fixes: 8610c7c6e3bd ("net: ipv6: add support for rpl sr exthdr") Signed-off-by: Alexander Aring Reported-by: maxpl0it Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/rpl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c index 488aec9e1a74..d1876f192225 100644 --- a/net/ipv6/rpl.c +++ b/net/ipv6/rpl.c @@ -32,7 +32,8 @@ static void *ipv6_rpl_segdata_pos(const struct ipv6_rpl_sr_hdr *hdr, int i) size_t ipv6_rpl_srh_size(unsigned char n, unsigned char cmpri, unsigned char cmpre) { - return (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre); + return sizeof(struct ipv6_rpl_sr_hdr) + (n * IPV6_PFXTAIL_LEN(cmpri)) + + IPV6_PFXTAIL_LEN(cmpre); } void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr, From 2a6a870e44dd88f1a6a2893c65ef756a9edfb4c7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 17 Apr 2023 16:00:40 +0200 Subject: [PATCH 160/175] mptcp: stops worker on unaccepted sockets at listener close This is a partial revert of the blamed commit, with a relevant change: mptcp_subflow_queue_clean() now just change the msk socket status and stop the worker, so that the UaF issue addressed by the blamed commit is not re-introduced. The above prevents the mptcp worker from running concurrently with inet_csk_listen_stop(), as such race would trigger a warning, as reported by Christoph: RSP: 002b:00007f784fe09cd8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e WARNING: CPU: 0 PID: 25807 at net/ipv4/inet_connection_sock.c:1387 inet_csk_listen_stop+0x664/0x870 net/ipv4/inet_connection_sock.c:1387 RAX: ffffffffffffffda RBX: 00000000006bc050 RCX: 00007f7850afd6a9 RDX: 0000000000000000 RSI: 0000000020000340 RDI: 0000000000000004 Modules linked in: RBP: 0000000000000002 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006bc05c R13: fffffffffffffea8 R14: 00000000006bc050 R15: 000000000001fe40 CPU: 0 PID: 25807 Comm: syz-executor.7 Not tainted 6.2.0-g778e54711659 #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 RIP: 0010:inet_csk_listen_stop+0x664/0x870 net/ipv4/inet_connection_sock.c:1387 RAX: 0000000000000000 RBX: ffff888100dfbd40 RCX: 0000000000000000 RDX: ffff8881363aab80 RSI: ffffffff81c494f4 RDI: 0000000000000005 RBP: ffff888126dad080 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffff888100dfe040 R13: 0000000000000001 R14: 0000000000000000 R15: ffff888100dfbdd8 FS: 00007f7850a2c800(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b32d26000 CR3: 000000012fdd8006 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: __tcp_close+0x5b2/0x620 net/ipv4/tcp.c:2875 __mptcp_close_ssk+0x145/0x3d0 net/mptcp/protocol.c:2427 mptcp_destroy_common+0x8a/0x1c0 net/mptcp/protocol.c:3277 mptcp_destroy+0x41/0x60 net/mptcp/protocol.c:3304 __mptcp_destroy_sock+0x56/0x140 net/mptcp/protocol.c:2965 __mptcp_close+0x38f/0x4a0 net/mptcp/protocol.c:3057 mptcp_close+0x24/0xe0 net/mptcp/protocol.c:3072 inet_release+0x53/0xa0 net/ipv4/af_inet.c:429 __sock_release+0x4e/0xf0 net/socket.c:651 sock_close+0x15/0x20 net/socket.c:1393 __fput+0xff/0x420 fs/file_table.c:321 task_work_run+0x8b/0xe0 kernel/task_work.c:179 resume_user_mode_work include/linux/resume_user_mode.h:49 [inline] exit_to_user_mode_loop kernel/entry/common.c:171 [inline] exit_to_user_mode_prepare+0x113/0x120 kernel/entry/common.c:203 __syscall_exit_to_user_mode_work kernel/entry/common.c:285 [inline] syscall_exit_to_user_mode+0x1d/0x40 kernel/entry/common.c:296 do_syscall_64+0x46/0x90 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7f7850af70dc RAX: 0000000000000000 RBX: 0000000000000004 RCX: 00007f7850af70dc RDX: 00007f7850a2c800 RSI: 0000000000000002 RDI: 0000000000000003 RBP: 00000000006bd980 R08: 0000000000000000 R09: 00000000000018a0 R10: 00000000316338a4 R11: 0000000000000293 R12: 0000000000211e31 R13: 00000000006bc05c R14: 00007f785062c000 R15: 0000000000211af0 Fixes: 0a3f4f1f9c27 ("mptcp: fix UaF in listener shutdown") Cc: stable@vger.kernel.org Reported-by: Christoph Paasch Link: https://github.com/multipath-tcp/mptcp_net-next/issues/371 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 6 +++- net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 72 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 06c5872e3b00..5181fb91595b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2365,8 +2365,12 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, mptcp_subflow_drop_ctx(ssk); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ - if (ssk->sk_state == TCP_LISTEN) + if (ssk->sk_state == TCP_LISTEN) { + tcp_set_state(ssk, TCP_CLOSE); + mptcp_subflow_queue_clean(sk, ssk); + inet_csk_listen_stop(ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); + } __tcp_close(ssk, 0); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 339a6f072989..3a2db1b862dd 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -629,6 +629,7 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); +void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index d34588850545..bf5e5c72b5ee 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1819,6 +1819,78 @@ static void subflow_state_change(struct sock *sk) } } +void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) +{ + struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; + struct mptcp_sock *msk, *next, *head = NULL; + struct request_sock *req; + + /* build a list of all unaccepted mptcp sockets */ + spin_lock_bh(&queue->rskq_lock); + for (req = queue->rskq_accept_head; req; req = req->dl_next) { + struct mptcp_subflow_context *subflow; + struct sock *ssk = req->sk; + + if (!sk_is_mptcp(ssk)) + continue; + + subflow = mptcp_subflow_ctx(ssk); + if (!subflow || !subflow->conn) + continue; + + /* skip if already in list */ + msk = mptcp_sk(subflow->conn); + if (msk->dl_next || msk == head) + continue; + + sock_hold(subflow->conn); + msk->dl_next = head; + head = msk; + } + spin_unlock_bh(&queue->rskq_lock); + if (!head) + return; + + /* can't acquire the msk socket lock under the subflow one, + * or will cause ABBA deadlock + */ + release_sock(listener_ssk); + + for (msk = head; msk; msk = next) { + struct sock *sk = (struct sock *)msk; + + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + next = msk->dl_next; + msk->dl_next = NULL; + + /* prevent the stack from later re-schedule the worker for + * this socket + */ + inet_sk_state_store(sk, TCP_CLOSE); + release_sock(sk); + + /* lockdep will report a false positive ABBA deadlock + * between cancel_work_sync and the listener socket. + * The involved locks belong to different sockets WRT + * the existing AB chain. + * Using a per socket key is problematic as key + * deregistration requires process context and must be + * performed at socket disposal time, in atomic + * context. + * Just tell lockdep to consider the listener socket + * released here. + */ + mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); + mptcp_cancel_work(sk); + mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_); + + sock_put(sk); + } + + /* we are still under the listener msk socket lock */ + lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); +} + static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); From 63740448a32eb662e05894425b47bcc5814136f4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 17 Apr 2023 16:00:41 +0200 Subject: [PATCH 161/175] mptcp: fix accept vs worker race The mptcp worker and mptcp_accept() can race, as reported by Christoph: refcount_t: addition on 0; use-after-free. WARNING: CPU: 1 PID: 14351 at lib/refcount.c:25 refcount_warn_saturate+0x105/0x1b0 lib/refcount.c:25 Modules linked in: CPU: 1 PID: 14351 Comm: syz-executor.2 Not tainted 6.3.0-rc1-gde5e8fd0123c #11 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 RIP: 0010:refcount_warn_saturate+0x105/0x1b0 lib/refcount.c:25 Code: 02 31 ff 89 de e8 1b f0 a7 ff 84 db 0f 85 6e ff ff ff e8 3e f5 a7 ff 48 c7 c7 d8 c7 34 83 c6 05 6d 2d 0f 02 01 e8 cb 3d 90 ff <0f> 0b e9 4f ff ff ff e8 1f f5 a7 ff 0f b6 1d 54 2d 0f 02 31 ff 89 RSP: 0018:ffffc90000a47bf8 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff88802eae98c0 RSI: ffffffff81097d4f RDI: 0000000000000001 RBP: ffff88802e712180 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: ffff88802eaea148 R12: ffff88802e712100 R13: ffff88802e712a88 R14: ffff888005cb93a8 R15: ffff88802e712a88 FS: 0000000000000000(0000) GS:ffff88803ed00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f277fd89120 CR3: 0000000035486002 CR4: 0000000000370ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __refcount_add include/linux/refcount.h:199 [inline] __refcount_inc include/linux/refcount.h:250 [inline] refcount_inc include/linux/refcount.h:267 [inline] sock_hold include/net/sock.h:775 [inline] __mptcp_close+0x4c6/0x4d0 net/mptcp/protocol.c:3051 mptcp_close+0x24/0xe0 net/mptcp/protocol.c:3072 inet_release+0x56/0xa0 net/ipv4/af_inet.c:429 __sock_release+0x51/0xf0 net/socket.c:653 sock_close+0x18/0x20 net/socket.c:1395 __fput+0x113/0x430 fs/file_table.c:321 task_work_run+0x96/0x100 kernel/task_work.c:179 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0x4fc/0x10c0 kernel/exit.c:869 do_group_exit+0x51/0xf0 kernel/exit.c:1019 get_signal+0x12b0/0x1390 kernel/signal.c:2859 arch_do_signal_or_restart+0x25/0x260 arch/x86/kernel/signal.c:306 exit_to_user_mode_loop kernel/entry/common.c:168 [inline] exit_to_user_mode_prepare+0x131/0x1a0 kernel/entry/common.c:203 __syscall_exit_to_user_mode_work kernel/entry/common.c:285 [inline] syscall_exit_to_user_mode+0x19/0x40 kernel/entry/common.c:296 do_syscall_64+0x46/0x90 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7fec4b4926a9 Code: Unable to access opcode bytes at 0x7fec4b49267f. RSP: 002b:00007fec49f9dd78 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: fffffffffffffe00 RBX: 00000000006bc058 RCX: 00007fec4b4926a9 RDX: 0000000000000000 RSI: 0000000000000080 RDI: 00000000006bc058 RBP: 00000000006bc050 R08: 00000000007df998 R09: 00000000007df998 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006bc05c R13: fffffffffffffea8 R14: 000000000000000b R15: 000000000001fe40 The root cause is that the worker can force fallback to TCP the first mptcp subflow, actually deleting the unaccepted msk socket. We can explicitly prevent the race delaying the unaccepted msk deletion at listener shutdown time. In case the closed subflow is later accepted, just drop the mptcp context and let the user-space deal with the paired mptcp socket. Fixes: b6985b9b8295 ("mptcp: use the workqueue to destroy unaccepted sockets") Cc: stable@vger.kernel.org Reported-by: Christoph Paasch Link: https://github.com/multipath-tcp/mptcp_net-next/issues/375 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Tested-by: Christoph Paasch Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 68 +++++++++++++++++++++++++++++--------------- net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 22 +++++++------- 3 files changed, 58 insertions(+), 33 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5181fb91595b..b998e9df53ce 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2315,7 +2315,26 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, unsigned int flags) { struct mptcp_sock *msk = mptcp_sk(sk); - bool need_push, dispose_it; + bool dispose_it, need_push = false; + + /* If the first subflow moved to a close state before accept, e.g. due + * to an incoming reset, mptcp either: + * - if either the subflow or the msk are dead, destroy the context + * (the subflow socket is deleted by inet_child_forget) and the msk + * - otherwise do nothing at the moment and take action at accept and/or + * listener shutdown - user-space must be able to accept() the closed + * socket. + */ + if (msk->in_accept_queue && msk->first == ssk) { + if (!sock_flag(sk, SOCK_DEAD) && !sock_flag(ssk, SOCK_DEAD)) + return; + + /* ensure later check in mptcp_worker() will dispose the msk */ + sock_set_flag(sk, SOCK_DEAD); + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + mptcp_subflow_drop_ctx(ssk); + goto out_release; + } dispose_it = !msk->subflow || ssk != msk->subflow->sk; if (dispose_it) @@ -2351,18 +2370,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (!inet_csk(ssk)->icsk_ulp_ops) { WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD)); kfree_rcu(subflow, rcu); - } else if (msk->in_accept_queue && msk->first == ssk) { - /* if the first subflow moved to a close state, e.g. due to - * incoming reset and we reach here before inet_child_forget() - * the TCP stack could later try to close it via - * inet_csk_listen_stop(), or deliver it to the user space via - * accept(). - * We can't delete the subflow - or risk a double free - nor let - * the msk survive - or will be leaked in the non accept scenario: - * fallback and let TCP cope with the subflow cleanup. - */ - WARN_ON_ONCE(sock_flag(ssk, SOCK_DEAD)); - mptcp_subflow_drop_ctx(ssk); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ if (ssk->sk_state == TCP_LISTEN) { @@ -2377,6 +2384,8 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, /* close acquired an extra ref */ __sock_put(ssk); } + +out_release: release_sock(ssk); sock_put(ssk); @@ -2431,21 +2440,14 @@ static void __mptcp_close_subflow(struct sock *sk) mptcp_close_ssk(sk, ssk, subflow); } - /* if the MPC subflow has been closed before the msk is accepted, - * msk will never be accept-ed, close it now - */ - if (!msk->first && msk->in_accept_queue) { - sock_set_flag(sk, SOCK_DEAD); - inet_sk_state_store(sk, TCP_CLOSE); - } } -static bool mptcp_check_close_timeout(const struct sock *sk) +static bool mptcp_should_close(const struct sock *sk) { s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp; struct mptcp_subflow_context *subflow; - if (delta >= TCP_TIMEWAIT_LEN) + if (delta >= TCP_TIMEWAIT_LEN || mptcp_sk(sk)->in_accept_queue) return true; /* if all subflows are in closed status don't bother with additional @@ -2653,7 +2655,7 @@ static void mptcp_worker(struct work_struct *work) * even if it is orphaned and in FIN_WAIT2 state */ if (sock_flag(sk, SOCK_DEAD)) { - if (mptcp_check_close_timeout(sk)) { + if (mptcp_should_close(sk)) { inet_sk_state_store(sk, TCP_CLOSE); mptcp_do_fastclose(sk); } @@ -2899,6 +2901,14 @@ static void __mptcp_destroy_sock(struct sock *sk) sock_put(sk); } +void __mptcp_unaccepted_force_close(struct sock *sk) +{ + sock_set_flag(sk, SOCK_DEAD); + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + __mptcp_destroy_sock(sk); +} + static __poll_t mptcp_check_readable(struct mptcp_sock *msk) { /* Concurrent splices from sk_receive_queue into receive_queue will @@ -3737,6 +3747,18 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } + + /* Do late cleanup for the first subflow as necessary. Also + * deal with bad peers not doing a complete shutdown. + */ + if (msk->first && + unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { + __mptcp_close_ssk(newsk, msk->first, + mptcp_subflow_ctx(msk->first), 0); + if (unlikely(list_empty(&msk->conn_list))) + inet_sk_state_store(newsk, TCP_CLOSE); + } + release_sock(newsk); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 3a2db1b862dd..d6469b6ab38e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -634,6 +634,7 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); +void __mptcp_unaccepted_force_close(struct sock *sk); void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bf5e5c72b5ee..281c1cc8dc8d 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -723,9 +723,12 @@ void mptcp_subflow_drop_ctx(struct sock *ssk) if (!ctx) return; - subflow_ulp_fallback(ssk, ctx); - if (ctx->conn) - sock_put(ctx->conn); + list_del(&mptcp_subflow_ctx(ssk)->node); + if (inet_csk(ssk)->icsk_ulp_ops) { + subflow_ulp_fallback(ssk, ctx); + if (ctx->conn) + sock_put(ctx->conn); + } kfree_rcu(ctx, rcu); } @@ -1824,6 +1827,7 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; struct mptcp_sock *msk, *next, *head = NULL; struct request_sock *req; + struct sock *sk; /* build a list of all unaccepted mptcp sockets */ spin_lock_bh(&queue->rskq_lock); @@ -1839,11 +1843,12 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s continue; /* skip if already in list */ - msk = mptcp_sk(subflow->conn); + sk = subflow->conn; + msk = mptcp_sk(sk); if (msk->dl_next || msk == head) continue; - sock_hold(subflow->conn); + sock_hold(sk); msk->dl_next = head; head = msk; } @@ -1857,16 +1862,13 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s release_sock(listener_ssk); for (msk = head; msk; msk = next) { - struct sock *sk = (struct sock *)msk; + sk = (struct sock *)msk; lock_sock_nested(sk, SINGLE_DEPTH_NESTING); next = msk->dl_next; msk->dl_next = NULL; - /* prevent the stack from later re-schedule the worker for - * this socket - */ - inet_sk_state_store(sk, TCP_CLOSE); + __mptcp_unaccepted_force_close(sk); release_sock(sk); /* lockdep will report a false positive ABBA deadlock From 1f64757ee2bb22a93ec89b4c71707297e8cca0ba Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 17 Apr 2023 18:52:51 +0200 Subject: [PATCH 162/175] mlxsw: pci: Fix possible crash during initialization During initialization the driver issues a reset command via its command interface in order to remove previous configuration from the device. After issuing the reset, the driver waits for 200ms before polling on the "system_status" register using memory-mapped IO until the device reaches a ready state (0x5E). The wait is necessary because the reset command only triggers the reset, but the reset itself happens asynchronously. If the driver starts polling too soon, the read of the "system_status" register will never return and the system will crash [1]. The issue was discovered when the device was flashed with a development firmware version where the reset routine took longer to complete. The issue was fixed in the firmware, but it exposed the fact that the current wait time is borderline. Fix by increasing the wait time from 200ms to 400ms. With this patch and the buggy firmware version, the issue did not reproduce in 10 reboots whereas without the patch the issue is reproduced quite consistently. [1] mce: CPUs not responding to MCE broadcast (may include false positives): 0,4 mce: CPUs not responding to MCE broadcast (may include false positives): 0,4 Kernel panic - not syncing: Timeout: Not all CPUs entered broadcast exception handler Shutting down cpus with NMI Kernel Offset: 0x12000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) Fixes: ac004e84164e ("mlxsw: pci: Wait longer before accessing the device after reset") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/pci_hw.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h index 48dbfea0a2a1..7cdf0ce24f28 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h +++ b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h @@ -26,7 +26,7 @@ #define MLXSW_PCI_CIR_TIMEOUT_MSECS 1000 #define MLXSW_PCI_SW_RESET_TIMEOUT_MSECS 900000 -#define MLXSW_PCI_SW_RESET_WAIT_MSECS 200 +#define MLXSW_PCI_SW_RESET_WAIT_MSECS 400 #define MLXSW_PCI_FW_READY 0xA1844 #define MLXSW_PCI_FW_READY_MASK 0xFFFF #define MLXSW_PCI_FW_READY_MAGIC 0x5E From fcd4843a19d50f9e59116b2643e1a7d171b6fca1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 17 Apr 2023 22:50:55 +0200 Subject: [PATCH 163/175] hamradio: drop ISA_DMA_API dependency It looks like the dependency got added accidentally in commit a553260618d8 ("[PATCH] ISA DMA Kconfig fixes - part 3"). Unlike the previously removed dmascc driver, the scc driver never used DMA. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/hamradio/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/hamradio/Kconfig b/drivers/net/hamradio/Kconfig index a9c44f08199d..a94c7bd5db2e 100644 --- a/drivers/net/hamradio/Kconfig +++ b/drivers/net/hamradio/Kconfig @@ -47,7 +47,7 @@ config BPQETHER config SCC tristate "Z8530 SCC driver" - depends on ISA && AX25 && ISA_DMA_API + depends on ISA && AX25 help These cards are used to connect your Linux box to an amateur radio in order to communicate with other computers. If you want to use From 359f5b0d4e26b7a7bcc574d6148b31a17cefe47d Mon Sep 17 00:00:00 2001 From: Li Lanzhe Date: Wed, 19 Apr 2023 07:50:29 -0400 Subject: [PATCH 164/175] spi: spi-rockchip: Fix missing unwind goto in rockchip_sfc_probe() If devm_request_irq() fails, then we are directly return 'ret' without clk_disable_unprepare(sfc->clk) and clk_disable_unprepare(sfc->hclk). Fix this by changing direct return to a goto 'err_irq'. Fixes: 0b89fc0a367e ("spi: rockchip-sfc: add rockchip serial flash controller") Signed-off-by: Li Lanzhe Reviewed-by: Dongliang Mu Link: https://lore.kernel.org/r/20230419115030.6029-1-u202212060@hust.edu.cn Signed-off-by: Mark Brown --- drivers/spi/spi-rockchip-sfc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-rockchip-sfc.c b/drivers/spi/spi-rockchip-sfc.c index bd87d3c92dd3..69347b6bf60c 100644 --- a/drivers/spi/spi-rockchip-sfc.c +++ b/drivers/spi/spi-rockchip-sfc.c @@ -632,7 +632,7 @@ static int rockchip_sfc_probe(struct platform_device *pdev) if (ret) { dev_err(dev, "Failed to request irq\n"); - return ret; + goto err_irq; } ret = rockchip_sfc_init(sfc); From 71b547f561247897a0a14f3082730156c0533fed Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 11 Apr 2023 15:24:13 +0000 Subject: [PATCH 165/175] bpf: Fix incorrect verifier pruning due to missing register precision taints Juan Jose et al reported an issue found via fuzzing where the verifier's pruning logic prematurely marks a program path as safe. Consider the following program: 0: (b7) r6 = 1024 1: (b7) r7 = 0 2: (b7) r8 = 0 3: (b7) r9 = -2147483648 4: (97) r6 %= 1025 5: (05) goto pc+0 6: (bd) if r6 <= r9 goto pc+2 7: (97) r6 %= 1 8: (b7) r9 = 0 9: (bd) if r6 <= r9 goto pc+1 10: (b7) r6 = 0 11: (b7) r0 = 0 12: (63) *(u32 *)(r10 -4) = r0 13: (18) r4 = 0xffff888103693400 // map_ptr(ks=4,vs=48) 15: (bf) r1 = r4 16: (bf) r2 = r10 17: (07) r2 += -4 18: (85) call bpf_map_lookup_elem#1 19: (55) if r0 != 0x0 goto pc+1 20: (95) exit 21: (77) r6 >>= 10 22: (27) r6 *= 8192 23: (bf) r1 = r0 24: (0f) r0 += r6 25: (79) r3 = *(u64 *)(r0 +0) 26: (7b) *(u64 *)(r1 +0) = r3 27: (95) exit The verifier treats this as safe, leading to oob read/write access due to an incorrect verifier conclusion: func#0 @0 0: R1=ctx(off=0,imm=0) R10=fp0 0: (b7) r6 = 1024 ; R6_w=1024 1: (b7) r7 = 0 ; R7_w=0 2: (b7) r8 = 0 ; R8_w=0 3: (b7) r9 = -2147483648 ; R9_w=-2147483648 4: (97) r6 %= 1025 ; R6_w=scalar() 5: (05) goto pc+0 6: (bd) if r6 <= r9 goto pc+2 ; R6_w=scalar(umin=18446744071562067969,var_off=(0xffffffff00000000; 0xffffffff)) R9_w=-2147483648 7: (97) r6 %= 1 ; R6_w=scalar() 8: (b7) r9 = 0 ; R9=0 9: (bd) if r6 <= r9 goto pc+1 ; R6=scalar(umin=1) R9=0 10: (b7) r6 = 0 ; R6_w=0 11: (b7) r0 = 0 ; R0_w=0 12: (63) *(u32 *)(r10 -4) = r0 last_idx 12 first_idx 9 regs=1 stack=0 before 11: (b7) r0 = 0 13: R0_w=0 R10=fp0 fp-8=0000???? 13: (18) r4 = 0xffff8ad3886c2a00 ; R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 15: (bf) r1 = r4 ; R1_w=map_ptr(off=0,ks=4,vs=48,imm=0) R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 16: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 17: (07) r2 += -4 ; R2_w=fp-4 18: (85) call bpf_map_lookup_elem#1 ; R0=map_value_or_null(id=1,off=0,ks=4,vs=48,imm=0) 19: (55) if r0 != 0x0 goto pc+1 ; R0=0 20: (95) exit from 19 to 21: R0=map_value(off=0,ks=4,vs=48,imm=0) R6=0 R7=0 R8=0 R9=0 R10=fp0 fp-8=mmmm???? 21: (77) r6 >>= 10 ; R6_w=0 22: (27) r6 *= 8192 ; R6_w=0 23: (bf) r1 = r0 ; R0=map_value(off=0,ks=4,vs=48,imm=0) R1_w=map_value(off=0,ks=4,vs=48,imm=0) 24: (0f) r0 += r6 last_idx 24 first_idx 19 regs=40 stack=0 before 23: (bf) r1 = r0 regs=40 stack=0 before 22: (27) r6 *= 8192 regs=40 stack=0 before 21: (77) r6 >>= 10 regs=40 stack=0 before 19: (55) if r0 != 0x0 goto pc+1 parent didn't have regs=40 stack=0 marks: R0_rw=map_value_or_null(id=1,off=0,ks=4,vs=48,imm=0) R6_rw=P0 R7=0 R8=0 R9=0 R10=fp0 fp-8=mmmm???? last_idx 18 first_idx 9 regs=40 stack=0 before 18: (85) call bpf_map_lookup_elem#1 regs=40 stack=0 before 17: (07) r2 += -4 regs=40 stack=0 before 16: (bf) r2 = r10 regs=40 stack=0 before 15: (bf) r1 = r4 regs=40 stack=0 before 13: (18) r4 = 0xffff8ad3886c2a00 regs=40 stack=0 before 12: (63) *(u32 *)(r10 -4) = r0 regs=40 stack=0 before 11: (b7) r0 = 0 regs=40 stack=0 before 10: (b7) r6 = 0 25: (79) r3 = *(u64 *)(r0 +0) ; R0_w=map_value(off=0,ks=4,vs=48,imm=0) R3_w=scalar() 26: (7b) *(u64 *)(r1 +0) = r3 ; R1_w=map_value(off=0,ks=4,vs=48,imm=0) R3_w=scalar() 27: (95) exit from 9 to 11: R1=ctx(off=0,imm=0) R6=0 R7=0 R8=0 R9=0 R10=fp0 11: (b7) r0 = 0 ; R0_w=0 12: (63) *(u32 *)(r10 -4) = r0 last_idx 12 first_idx 11 regs=1 stack=0 before 11: (b7) r0 = 0 13: R0_w=0 R10=fp0 fp-8=0000???? 13: (18) r4 = 0xffff8ad3886c2a00 ; R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 15: (bf) r1 = r4 ; R1_w=map_ptr(off=0,ks=4,vs=48,imm=0) R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 16: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 17: (07) r2 += -4 ; R2_w=fp-4 18: (85) call bpf_map_lookup_elem#1 frame 0: propagating r6 last_idx 19 first_idx 11 regs=40 stack=0 before 18: (85) call bpf_map_lookup_elem#1 regs=40 stack=0 before 17: (07) r2 += -4 regs=40 stack=0 before 16: (bf) r2 = r10 regs=40 stack=0 before 15: (bf) r1 = r4 regs=40 stack=0 before 13: (18) r4 = 0xffff8ad3886c2a00 regs=40 stack=0 before 12: (63) *(u32 *)(r10 -4) = r0 regs=40 stack=0 before 11: (b7) r0 = 0 parent didn't have regs=40 stack=0 marks: R1=ctx(off=0,imm=0) R6_r=P0 R7=0 R8=0 R9=0 R10=fp0 last_idx 9 first_idx 9 regs=40 stack=0 before 9: (bd) if r6 <= r9 goto pc+1 parent didn't have regs=40 stack=0 marks: R1=ctx(off=0,imm=0) R6_rw=Pscalar() R7_w=0 R8_w=0 R9_rw=0 R10=fp0 last_idx 8 first_idx 0 regs=40 stack=0 before 8: (b7) r9 = 0 regs=40 stack=0 before 7: (97) r6 %= 1 regs=40 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=40 stack=0 before 5: (05) goto pc+0 regs=40 stack=0 before 4: (97) r6 %= 1025 regs=40 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 19: safe frame 0: propagating r6 last_idx 9 first_idx 0 regs=40 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=40 stack=0 before 5: (05) goto pc+0 regs=40 stack=0 before 4: (97) r6 %= 1025 regs=40 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 from 6 to 9: safe verification time 110 usec stack depth 4 processed 36 insns (limit 1000000) max_states_per_insn 0 total_states 3 peak_states 3 mark_read 2 The verifier considers this program as safe by mistakenly pruning unsafe code paths. In the above func#0, code lines 0-10 are of interest. In line 0-3 registers r6 to r9 are initialized with known scalar values. In line 4 the register r6 is reset to an unknown scalar given the verifier does not track modulo operations. Due to this, the verifier can also not determine precisely which branches in line 6 and 9 are taken, therefore it needs to explore them both. As can be seen, the verifier starts with exploring the false/fall-through paths first. The 'from 19 to 21' path has both r6=0 and r9=0 and the pointer arithmetic on r0 += r6 is therefore considered safe. Given the arithmetic, r6 is correctly marked for precision tracking where backtracking kicks in where it walks back the current path all the way where r6 was set to 0 in the fall-through branch. Next, the pruning logics pops the path 'from 9 to 11' from the stack. Also here, the state of the registers is the same, that is, r6=0 and r9=0, so that at line 19 the path can be pruned as it is considered safe. It is interesting to note that the conditional in line 9 turned r6 into a more precise state, that is, in the fall-through path at the beginning of line 10, it is R6=scalar(umin=1), and in the branch-taken path (which is analyzed here) at the beginning of line 11, r6 turned into a known const r6=0 as r9=0 prior to that and therefore (unsigned) r6 <= 0 concludes that r6 must be 0 (**): [...] ; R6_w=scalar() 9: (bd) if r6 <= r9 goto pc+1 ; R6=scalar(umin=1) R9=0 [...] from 9 to 11: R1=ctx(off=0,imm=0) R6=0 R7=0 R8=0 R9=0 R10=fp0 [...] The next path is 'from 6 to 9'. The verifier considers the old and current state equivalent, and therefore prunes the search incorrectly. Looking into the two states which are being compared by the pruning logic at line 9, the old state consists of R6_rwD=Pscalar() R9_rwD=0 R10=fp0 and the new state consists of R1=ctx(off=0,imm=0) R6_w=scalar(umax=18446744071562067968) R7_w=0 R8_w=0 R9_w=-2147483648 R10=fp0. While r6 had the reg->precise flag correctly set in the old state, r9 did not. Both r6'es are considered as equivalent given the old one is a superset of the current, more precise one, however, r9's actual values (0 vs 0x80000000) mismatch. Given the old r9 did not have reg->precise flag set, the verifier does not consider the register as contributing to the precision state of r6, and therefore it considered both r9 states as equivalent. However, for this specific pruned path (which is also the actual path taken at runtime), register r6 will be 0x400 and r9 0x80000000 when reaching line 21, thus oob-accessing the map. The purpose of precision tracking is to initially mark registers (including spilled ones) as imprecise to help verifier's pruning logic finding equivalent states it can then prune if they don't contribute to the program's safety aspects. For example, if registers are used for pointer arithmetic or to pass constant length to a helper, then the verifier sets reg->precise flag and backtracks the BPF program instruction sequence and chain of verifier states to ensure that the given register or stack slot including their dependencies are marked as precisely tracked scalar. This also includes any other registers and slots that contribute to a tracked state of given registers/stack slot. This backtracking relies on recorded jmp_history and is able to traverse entire chain of parent states. This process ends only when all the necessary registers/slots and their transitive dependencies are marked as precise. The backtrack_insn() is called from the current instruction up to the first instruction, and its purpose is to compute a bitmask of registers and stack slots that need precision tracking in the parent's verifier state. For example, if a current instruction is r6 = r7, then r6 needs precision after this instruction and r7 needs precision before this instruction, that is, in the parent state. Hence for the latter r7 is marked and r6 unmarked. For the class of jmp/jmp32 instructions, backtrack_insn() today only looks at call and exit instructions and for all other conditionals the masks remain as-is. However, in the given situation register r6 has a dependency on r9 (as described above in **), so also that one needs to be marked for precision tracking. In other words, if an imprecise register influences a precise one, then the imprecise register should also be marked precise. Meaning, in the parent state both dest and src register need to be tracked for precision and therefore the marking must be more conservative by setting reg->precise flag for both. The precision propagation needs to cover both for the conditional: if the src reg was marked but not the dst reg and vice versa. After the fix the program is correctly rejected: func#0 @0 0: R1=ctx(off=0,imm=0) R10=fp0 0: (b7) r6 = 1024 ; R6_w=1024 1: (b7) r7 = 0 ; R7_w=0 2: (b7) r8 = 0 ; R8_w=0 3: (b7) r9 = -2147483648 ; R9_w=-2147483648 4: (97) r6 %= 1025 ; R6_w=scalar() 5: (05) goto pc+0 6: (bd) if r6 <= r9 goto pc+2 ; R6_w=scalar(umin=18446744071562067969,var_off=(0xffffffff80000000; 0x7fffffff),u32_min=-2147483648) R9_w=-2147483648 7: (97) r6 %= 1 ; R6_w=scalar() 8: (b7) r9 = 0 ; R9=0 9: (bd) if r6 <= r9 goto pc+1 ; R6=scalar(umin=1) R9=0 10: (b7) r6 = 0 ; R6_w=0 11: (b7) r0 = 0 ; R0_w=0 12: (63) *(u32 *)(r10 -4) = r0 last_idx 12 first_idx 9 regs=1 stack=0 before 11: (b7) r0 = 0 13: R0_w=0 R10=fp0 fp-8=0000???? 13: (18) r4 = 0xffff9290dc5bfe00 ; R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 15: (bf) r1 = r4 ; R1_w=map_ptr(off=0,ks=4,vs=48,imm=0) R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 16: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 17: (07) r2 += -4 ; R2_w=fp-4 18: (85) call bpf_map_lookup_elem#1 ; R0=map_value_or_null(id=1,off=0,ks=4,vs=48,imm=0) 19: (55) if r0 != 0x0 goto pc+1 ; R0=0 20: (95) exit from 19 to 21: R0=map_value(off=0,ks=4,vs=48,imm=0) R6=0 R7=0 R8=0 R9=0 R10=fp0 fp-8=mmmm???? 21: (77) r6 >>= 10 ; R6_w=0 22: (27) r6 *= 8192 ; R6_w=0 23: (bf) r1 = r0 ; R0=map_value(off=0,ks=4,vs=48,imm=0) R1_w=map_value(off=0,ks=4,vs=48,imm=0) 24: (0f) r0 += r6 last_idx 24 first_idx 19 regs=40 stack=0 before 23: (bf) r1 = r0 regs=40 stack=0 before 22: (27) r6 *= 8192 regs=40 stack=0 before 21: (77) r6 >>= 10 regs=40 stack=0 before 19: (55) if r0 != 0x0 goto pc+1 parent didn't have regs=40 stack=0 marks: R0_rw=map_value_or_null(id=1,off=0,ks=4,vs=48,imm=0) R6_rw=P0 R7=0 R8=0 R9=0 R10=fp0 fp-8=mmmm???? last_idx 18 first_idx 9 regs=40 stack=0 before 18: (85) call bpf_map_lookup_elem#1 regs=40 stack=0 before 17: (07) r2 += -4 regs=40 stack=0 before 16: (bf) r2 = r10 regs=40 stack=0 before 15: (bf) r1 = r4 regs=40 stack=0 before 13: (18) r4 = 0xffff9290dc5bfe00 regs=40 stack=0 before 12: (63) *(u32 *)(r10 -4) = r0 regs=40 stack=0 before 11: (b7) r0 = 0 regs=40 stack=0 before 10: (b7) r6 = 0 25: (79) r3 = *(u64 *)(r0 +0) ; R0_w=map_value(off=0,ks=4,vs=48,imm=0) R3_w=scalar() 26: (7b) *(u64 *)(r1 +0) = r3 ; R1_w=map_value(off=0,ks=4,vs=48,imm=0) R3_w=scalar() 27: (95) exit from 9 to 11: R1=ctx(off=0,imm=0) R6=0 R7=0 R8=0 R9=0 R10=fp0 11: (b7) r0 = 0 ; R0_w=0 12: (63) *(u32 *)(r10 -4) = r0 last_idx 12 first_idx 11 regs=1 stack=0 before 11: (b7) r0 = 0 13: R0_w=0 R10=fp0 fp-8=0000???? 13: (18) r4 = 0xffff9290dc5bfe00 ; R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 15: (bf) r1 = r4 ; R1_w=map_ptr(off=0,ks=4,vs=48,imm=0) R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 16: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 17: (07) r2 += -4 ; R2_w=fp-4 18: (85) call bpf_map_lookup_elem#1 frame 0: propagating r6 last_idx 19 first_idx 11 regs=40 stack=0 before 18: (85) call bpf_map_lookup_elem#1 regs=40 stack=0 before 17: (07) r2 += -4 regs=40 stack=0 before 16: (bf) r2 = r10 regs=40 stack=0 before 15: (bf) r1 = r4 regs=40 stack=0 before 13: (18) r4 = 0xffff9290dc5bfe00 regs=40 stack=0 before 12: (63) *(u32 *)(r10 -4) = r0 regs=40 stack=0 before 11: (b7) r0 = 0 parent didn't have regs=40 stack=0 marks: R1=ctx(off=0,imm=0) R6_r=P0 R7=0 R8=0 R9=0 R10=fp0 last_idx 9 first_idx 9 regs=40 stack=0 before 9: (bd) if r6 <= r9 goto pc+1 parent didn't have regs=240 stack=0 marks: R1=ctx(off=0,imm=0) R6_rw=Pscalar() R7_w=0 R8_w=0 R9_rw=P0 R10=fp0 last_idx 8 first_idx 0 regs=240 stack=0 before 8: (b7) r9 = 0 regs=40 stack=0 before 7: (97) r6 %= 1 regs=40 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=240 stack=0 before 5: (05) goto pc+0 regs=240 stack=0 before 4: (97) r6 %= 1025 regs=240 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 19: safe from 6 to 9: R1=ctx(off=0,imm=0) R6_w=scalar(umax=18446744071562067968) R7_w=0 R8_w=0 R9_w=-2147483648 R10=fp0 9: (bd) if r6 <= r9 goto pc+1 last_idx 9 first_idx 0 regs=40 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=240 stack=0 before 5: (05) goto pc+0 regs=240 stack=0 before 4: (97) r6 %= 1025 regs=240 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 last_idx 9 first_idx 0 regs=200 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=240 stack=0 before 5: (05) goto pc+0 regs=240 stack=0 before 4: (97) r6 %= 1025 regs=240 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 11: R6=scalar(umax=18446744071562067968) R9=-2147483648 11: (b7) r0 = 0 ; R0_w=0 12: (63) *(u32 *)(r10 -4) = r0 last_idx 12 first_idx 11 regs=1 stack=0 before 11: (b7) r0 = 0 13: R0_w=0 R10=fp0 fp-8=0000???? 13: (18) r4 = 0xffff9290dc5bfe00 ; R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 15: (bf) r1 = r4 ; R1_w=map_ptr(off=0,ks=4,vs=48,imm=0) R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 16: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 17: (07) r2 += -4 ; R2_w=fp-4 18: (85) call bpf_map_lookup_elem#1 ; R0_w=map_value_or_null(id=3,off=0,ks=4,vs=48,imm=0) 19: (55) if r0 != 0x0 goto pc+1 ; R0_w=0 20: (95) exit from 19 to 21: R0=map_value(off=0,ks=4,vs=48,imm=0) R6=scalar(umax=18446744071562067968) R7=0 R8=0 R9=-2147483648 R10=fp0 fp-8=mmmm???? 21: (77) r6 >>= 10 ; R6_w=scalar(umax=18014398507384832,var_off=(0x0; 0x3fffffffffffff)) 22: (27) r6 *= 8192 ; R6_w=scalar(smax=9223372036854767616,umax=18446744073709543424,var_off=(0x0; 0xffffffffffffe000),s32_max=2147475456,u32_max=-8192) 23: (bf) r1 = r0 ; R0=map_value(off=0,ks=4,vs=48,imm=0) R1_w=map_value(off=0,ks=4,vs=48,imm=0) 24: (0f) r0 += r6 last_idx 24 first_idx 21 regs=40 stack=0 before 23: (bf) r1 = r0 regs=40 stack=0 before 22: (27) r6 *= 8192 regs=40 stack=0 before 21: (77) r6 >>= 10 parent didn't have regs=40 stack=0 marks: R0_rw=map_value(off=0,ks=4,vs=48,imm=0) R6_r=Pscalar(umax=18446744071562067968) R7=0 R8=0 R9=-2147483648 R10=fp0 fp-8=mmmm???? last_idx 19 first_idx 11 regs=40 stack=0 before 19: (55) if r0 != 0x0 goto pc+1 regs=40 stack=0 before 18: (85) call bpf_map_lookup_elem#1 regs=40 stack=0 before 17: (07) r2 += -4 regs=40 stack=0 before 16: (bf) r2 = r10 regs=40 stack=0 before 15: (bf) r1 = r4 regs=40 stack=0 before 13: (18) r4 = 0xffff9290dc5bfe00 regs=40 stack=0 before 12: (63) *(u32 *)(r10 -4) = r0 regs=40 stack=0 before 11: (b7) r0 = 0 parent didn't have regs=40 stack=0 marks: R1=ctx(off=0,imm=0) R6_rw=Pscalar(umax=18446744071562067968) R7_w=0 R8_w=0 R9_w=-2147483648 R10=fp0 last_idx 9 first_idx 0 regs=40 stack=0 before 9: (bd) if r6 <= r9 goto pc+1 regs=240 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=240 stack=0 before 5: (05) goto pc+0 regs=240 stack=0 before 4: (97) r6 %= 1025 regs=240 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 math between map_value pointer and register with unbounded min value is not allowed verification time 886 usec stack depth 4 processed 49 insns (limit 1000000) max_states_per_insn 1 total_states 5 peak_states 5 mark_read 2 Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Reported-by: Juan Jose Lopez Jaimez Reported-by: Meador Inge Reported-by: Simon Scannell Reported-by: Nenad Stojanovski Signed-off-by: Daniel Borkmann Co-developed-by: Andrii Nakryiko Signed-off-by: Andrii Nakryiko Reviewed-by: John Fastabend Reviewed-by: Juan Jose Lopez Jaimez Reviewed-by: Meador Inge Reviewed-by: Simon Scannell --- kernel/bpf/verifier.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d517d13878cf..767e8930b0bd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2967,6 +2967,21 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, } } else if (opcode == BPF_EXIT) { return -ENOTSUPP; + } else if (BPF_SRC(insn->code) == BPF_X) { + if (!(*reg_mask & (dreg | sreg))) + return 0; + /* dreg sreg + * Both dreg and sreg need precision before + * this insn. If only sreg was marked precise + * before it would be equally necessary to + * propagate it to dreg. + */ + *reg_mask |= (sreg | dreg); + /* else dreg K + * Only dreg still needs precision before + * this insn, so for the K-based conditional + * there is nothing new to be marked. + */ } } else if (class == BPF_LD) { if (!(*reg_mask & dreg)) From ccc4505454db10402d5284f22d8b7db62e636fc5 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 10 Feb 2023 16:26:22 +0100 Subject: [PATCH 166/175] rust: fix regexp in scripts/is_rust_module.sh nm can use "R" or "r" to show read-only data sections, but scripts/is_rust_module.sh can only recognize "r", so with some versions of binutils it can fail to detect if a module is a Rust module or not. Right now we're using this script only to determine if we need to skip BTF generation (that is disabled globally if CONFIG_RUST is enabled), but it's still nice to fix this script to do the proper job. Moreover, with this patch applied I can also relax the constraint of "RUST depends on !DEBUG_INFO_BTF" and build a kernel with Rust and BTF enabled at the same time (of course BTF generation is still skipped for Rust modules). [ Miguel: The actual reason is likely to be a change on the Rust compiler between 1.61.0 and 1.62.0: echo '#[used] static S: () = ();' | rustup run 1.61.0 rustc --emit=obj --crate-type=lib - && nm rust_out.o echo '#[used] static S: () = ();' | rustup run 1.62.0 rustc --emit=obj --crate-type=lib - && nm rust_out.o Gives: 0000000000000000 r _ZN8rust_out1S17h48027ce0da975467E 0000000000000000 R _ZN8rust_out1S17h58e1f3d9c0e97cefE See https://godbolt.org/z/KE6jneoo4. ] Signed-off-by: Andrea Righi Reviewed-by: Vincenzo Palazzo Reviewed-by: Eric Curtin Reviewed-by: Martin Rodriguez Reboredo Signed-off-by: Miguel Ojeda --- scripts/is_rust_module.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/is_rust_module.sh b/scripts/is_rust_module.sh index 28b3831a7593..464761a7cf7f 100755 --- a/scripts/is_rust_module.sh +++ b/scripts/is_rust_module.sh @@ -13,4 +13,4 @@ set -e # # In the future, checking for the `.comment` section may be another # option, see https://github.com/rust-lang/rust/pull/97550. -${NM} "$*" | grep -qE '^[0-9a-fA-F]+ r _R[^[:space:]]+16___IS_RUST_MODULE[^[:space:]]*$' +${NM} "$*" | grep -qE '^[0-9a-fA-F]+ [Rr] _R[^[:space:]]+16___IS_RUST_MODULE[^[:space:]]*$' From d966c3cab924fb750fefef11e77a6fa07dd5420e Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 10 Feb 2023 22:51:41 +0100 Subject: [PATCH 167/175] rust: allow to use INIT_STACK_ALL_ZERO With CONFIG_INIT_STACK_ALL_ZERO enabled, bindgen passes -ftrivial-auto-var-init=zero to clang, that triggers the following error: error: '-ftrivial-auto-var-init=zero' hasn't been enabled; enable it at your own peril for benchmarking purpose only with '-enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang' However, this additional option that is currently required by clang is deprecated since clang-16 and going to be removed in the future, likely with clang-18. So, make sure bindgen is using this extra option if the major version of the libclang used by bindgen is < 16. In this way we can enable CONFIG_INIT_STACK_ALL_ZERO with CONFIG_RUST without triggering any build error. Link: https://github.com/llvm/llvm-project/issues/44842 Link: https://github.com/llvm/llvm-project/blob/llvmorg-16.0.0-rc2/clang/docs/ReleaseNotes.rst#deprecated-compiler-flags Signed-off-by: Andrea Righi Reviewed-by: Kees Cook [Changed to < 16, added link and reworded] Signed-off-by: Miguel Ojeda --- rust/Makefile | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/rust/Makefile b/rust/Makefile index 41ac8401a8be..aef85e9e8eeb 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -262,6 +262,20 @@ BINDGEN_TARGET := $(BINDGEN_TARGET_$(SRCARCH)) # some configurations, with new GCC versions, etc. bindgen_extra_c_flags = -w --target=$(BINDGEN_TARGET) +# Auto variable zero-initialization requires an additional special option with +# clang that is going to be removed sometime in the future (likely in +# clang-18), so make sure to pass this option only if clang supports it +# (libclang major version < 16). +# +# https://github.com/llvm/llvm-project/issues/44842 +# https://github.com/llvm/llvm-project/blob/llvmorg-16.0.0-rc2/clang/docs/ReleaseNotes.rst#deprecated-compiler-flags +ifdef CONFIG_INIT_STACK_ALL_ZERO +libclang_maj_ver=$(shell $(BINDGEN) $(srctree)/scripts/rust_is_available_bindgen_libclang.h 2>&1 | sed -ne 's/.*clang version \([0-9]*\).*/\1/p') +ifeq ($(shell expr $(libclang_maj_ver) \< 16), 1) +bindgen_extra_c_flags += -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang +endif +endif + bindgen_c_flags = $(filter-out $(bindgen_skip_c_flags), $(c_flags)) \ $(bindgen_extra_c_flags) endif From 3d2f8f1f184c60508f7af3022536651d7ac2dd07 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 17 Apr 2023 20:19:33 +0200 Subject: [PATCH 168/175] net: dsa: microchip: ksz8795: Correctly handle huge frame configuration Because of the logic in place, SW_HUGE_PACKET can never be set. (If the first condition is true, then the 2nd one is also true, but is not executed) Change the logic and update each bit individually. Fixes: 29d1e85f45e0 ("net: dsa: microchip: ksz8: add MTU configuration support") Signed-off-by: Christophe JAILLET Reviewed-by: Oleksij Rempel Reviewed-by: Simon Horman Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/43107d9e8b5b8b05f0cbd4e1f47a2bb88c8747b2.1681755535.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz8795.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index 3fffd5da8d3b..ffcad057d065 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -96,7 +96,7 @@ static int ksz8795_change_mtu(struct ksz_device *dev, int frame_size) if (frame_size > KSZ8_LEGAL_PACKET_SIZE) ctrl2 |= SW_LEGAL_PACKET_DISABLE; - else if (frame_size > KSZ8863_NORMAL_PACKET_SIZE) + if (frame_size > KSZ8863_NORMAL_PACKET_SIZE) ctrl1 |= SW_HUGE_PACKET; ret = ksz_rmw8(dev, REG_SW_CTRL_1, SW_HUGE_PACKET, ctrl1); From 8c154d272c3e03b100baaf1df473f22a78fa403e Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Tue, 18 Apr 2023 13:25:11 -0700 Subject: [PATCH 169/175] bnxt_en: fix free-runnig PHC mode The patch in fixes changed the way real-time mode is chosen for PHC on the NIC. Apparently there is one more use case of the check outside of ptp part of the driver which was not converted to the new macro and is making a lot of noise in free-running mode. Fixes: 131db4991622 ("bnxt_en: reset PHC frequency in free-running mode") Signed-off-by: Vadim Fedorenko Reviewed-by: Michael Chan Reviewed-by: Pavan Chebbi Link: https://lore.kernel.org/r/20230418202511.1544735-1-vadfed@meta.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index ef97a4190b39..651b79ce5d80 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2388,7 +2388,7 @@ static int bnxt_async_event_process(struct bnxt *bp, case ASYNC_EVENT_CMPL_EVENT_ID_PHC_UPDATE: { switch (BNXT_EVENT_PHC_EVENT_TYPE(data1)) { case ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_RTC_UPDATE: - if (bp->fw_cap & BNXT_FW_CAP_PTP_RTC) { + if (BNXT_PTP_USE_RTC(bp)) { struct bnxt_ptp_cfg *ptp = bp->ptp_cfg; u64 ns; From 67d47b95119ad589b0a0b16b88b1dd9a04061ced Mon Sep 17 00:00:00 2001 From: Sebastian Basierski Date: Mon, 17 Apr 2023 13:53:45 -0700 Subject: [PATCH 170/175] e1000e: Disable TSO on i219-LM card to increase speed While using i219-LM card currently it was only possible to achieve about 60% of maximum speed due to regression introduced in Linux 5.8. This was caused by TSO not being disabled by default despite commit f29801030ac6 ("e1000e: Disable TSO for buffer overrun workaround"). Fix that by disabling TSO during driver probe. Fixes: f29801030ac6 ("e1000e: Disable TSO for buffer overrun workaround") Signed-off-by: Sebastian Basierski Signed-off-by: Mateusz Palczewski Tested-by: Naama Meir Signed-off-by: Tony Nguyen Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230417205345.1030801-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/e1000e/netdev.c | 51 +++++++++++----------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index e1eb1de88bf9..e14d1e45318f 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -5288,31 +5288,6 @@ static void e1000_watchdog_task(struct work_struct *work) ew32(TARC(0), tarc0); } - /* disable TSO for pcie and 10/100 speeds, to avoid - * some hardware issues - */ - if (!(adapter->flags & FLAG_TSO_FORCE)) { - switch (adapter->link_speed) { - case SPEED_10: - case SPEED_100: - e_info("10/100 speed: disabling TSO\n"); - netdev->features &= ~NETIF_F_TSO; - netdev->features &= ~NETIF_F_TSO6; - break; - case SPEED_1000: - netdev->features |= NETIF_F_TSO; - netdev->features |= NETIF_F_TSO6; - break; - default: - /* oops */ - break; - } - if (hw->mac.type == e1000_pch_spt) { - netdev->features &= ~NETIF_F_TSO; - netdev->features &= ~NETIF_F_TSO6; - } - } - /* enable transmits in the hardware, need to do this * after setting TARC(0) */ @@ -7526,6 +7501,32 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) NETIF_F_RXCSUM | NETIF_F_HW_CSUM); + /* disable TSO for pcie and 10/100 speeds to avoid + * some hardware issues and for i219 to fix transfer + * speed being capped at 60% + */ + if (!(adapter->flags & FLAG_TSO_FORCE)) { + switch (adapter->link_speed) { + case SPEED_10: + case SPEED_100: + e_info("10/100 speed: disabling TSO\n"); + netdev->features &= ~NETIF_F_TSO; + netdev->features &= ~NETIF_F_TSO6; + break; + case SPEED_1000: + netdev->features |= NETIF_F_TSO; + netdev->features |= NETIF_F_TSO6; + break; + default: + /* oops */ + break; + } + if (hw->mac.type == e1000_pch_spt) { + netdev->features &= ~NETIF_F_TSO; + netdev->features &= ~NETIF_F_TSO6; + } + } + /* Set user-changeable features (subset of all device features) */ netdev->hw_features = netdev->features; netdev->hw_features |= NETIF_F_RXFCS; From 7b3aba7ea336d069b30b91502d47792c280bae2b Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Tue, 18 Apr 2023 10:36:59 +0200 Subject: [PATCH 171/175] mailmap: add entries for Mat Martineau Map Mat's old corporate addresses to his kernel.org one. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20230418-upstream-net-20230418-mailmap-mat-v1-1-13ca5dc83037@tessares.net Signed-off-by: Jakub Kicinski --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index e42486317d18..69dae9b6ec95 100644 --- a/.mailmap +++ b/.mailmap @@ -297,6 +297,8 @@ Martin Kepplinger Martin Kepplinger Martyna Szapar-Mudlaw Mathieu Othacehe +Mat Martineau +Mat Martineau Matthew Wilcox Matthew Wilcox Matthew Wilcox From 52b37ae8aa6797a8183e8554672797045e81d9ae Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 18 Apr 2023 16:13:18 -0700 Subject: [PATCH 172/175] MAINTAINERS: Resume MPTCP co-maintainer role I'm returning to the MPTCP maintainer role I held for most of the subsytem's history. This time I'm using my kernel.org email address. Acked-by: Matthieu Baerts Link: https://lore.kernel.org/mptcp/af85e467-8d0a-4eba-b5f8-e2f2c5d24984@tessares.net/ Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20230418231318.115331-1-martineau@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0e64787aace8..c6545eb54104 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14594,6 +14594,7 @@ F: net/netlabel/ NETWORKING [MPTCP] M: Matthieu Baerts +M: Mat Martineau L: netdev@vger.kernel.org L: mptcp@lists.linux.dev S: Maintained From f52cc627b832e08a7bcf1b7e81e650ec308fe1d8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 13 Apr 2023 15:25:47 -0700 Subject: [PATCH 173/175] Revert "net/mlx5: Enable management PF initialization" This reverts commit fe998a3c77b9f989a30a2a01fb00d3729a6d53a4. Paul reports that it causes a regression with IB on CX4 and FW 12.18.1000. In addition I think that the concept of "management PF" is not fully accepted and requires a discussion. Fixes: fe998a3c77b9 ("net/mlx5: Enable management PF initialization") Reported-by: Paul Moore Link: https://lore.kernel.org/all/CAHC9VhQ7A4+msL38WpbOMYjAqLp0EtOjeLh4Dc6SQtD6OUvCQg@mail.gmail.com/ Link: https://lore.kernel.org/r/20230413222547.56901-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/dev.c | 6 ------ drivers/net/ethernet/mellanox/mlx5/core/ecpf.c | 8 -------- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 2 +- include/linux/mlx5/driver.h | 5 ----- 4 files changed, 1 insertion(+), 20 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c index 445fe30c3d0b..2e7806001fdc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c @@ -59,9 +59,6 @@ bool mlx5_eth_supported(struct mlx5_core_dev *dev) if (!IS_ENABLED(CONFIG_MLX5_CORE_EN)) return false; - if (mlx5_core_is_management_pf(dev)) - return false; - if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) return false; @@ -201,9 +198,6 @@ bool mlx5_rdma_supported(struct mlx5_core_dev *dev) if (!IS_ENABLED(CONFIG_MLX5_INFINIBAND)) return false; - if (mlx5_core_is_management_pf(dev)) - return false; - if (dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_IB_ADEV) return false; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c index 7c9c4e40c019..d000236ddbac 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c @@ -75,10 +75,6 @@ int mlx5_ec_init(struct mlx5_core_dev *dev) if (!mlx5_core_is_ecpf(dev)) return 0; - /* Management PF don't have a peer PF */ - if (mlx5_core_is_management_pf(dev)) - return 0; - return mlx5_host_pf_init(dev); } @@ -89,10 +85,6 @@ void mlx5_ec_cleanup(struct mlx5_core_dev *dev) if (!mlx5_core_is_ecpf(dev)) return; - /* Management PF don't have a peer PF */ - if (mlx5_core_is_management_pf(dev)) - return; - mlx5_host_pf_cleanup(dev); err = mlx5_wait_for_pages(dev, &dev->priv.page_counters[MLX5_HOST_PF]); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 8bdf28762f41..19fed514fc17 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1488,7 +1488,7 @@ int mlx5_esw_sf_max_hpf_functions(struct mlx5_core_dev *dev, u16 *max_sfs, u16 * void *hca_caps; int err; - if (!mlx5_core_is_ecpf(dev) || mlx5_core_is_management_pf(dev)) { + if (!mlx5_core_is_ecpf(dev)) { *max_sfs = 0; return 0; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f33389b42209..7e225e41d55b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1211,11 +1211,6 @@ static inline bool mlx5_core_is_vf(const struct mlx5_core_dev *dev) return dev->coredev_type == MLX5_COREDEV_VF; } -static inline bool mlx5_core_is_management_pf(const struct mlx5_core_dev *dev) -{ - return MLX5_CAP_GEN(dev, num_ports) == 1 && !MLX5_CAP_GEN(dev, native_port_num); -} - static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev) { return dev->caps.embedded_cpu; From 927cdea5d2095287ddd5246e5aa68eb5d68db2be Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 18 Apr 2023 18:59:02 +0300 Subject: [PATCH 174/175] net: bridge: switchdev: don't notify FDB entries with "master dynamic" There is a structural problem in switchdev, where the flag bits in struct switchdev_notifier_fdb_info (added_by_user, is_local etc) only represent a simplified / denatured view of what's in struct net_bridge_fdb_entry :: flags (BR_FDB_ADDED_BY_USER, BR_FDB_LOCAL etc). Each time we want to pass more information about struct net_bridge_fdb_entry :: flags to struct switchdev_notifier_fdb_info (here, BR_FDB_STATIC), we find that FDB entries were already notified to switchdev with no regard to this flag, and thus, switchdev drivers had no indication whether the notified entries were static or not. For example, this command: ip link add br0 type bridge && ip link set swp0 master br0 bridge fdb add dev swp0 00:01:02:03:04:05 master dynamic has never worked as intended with switchdev. It causes a struct net_bridge_fdb_entry to be passed to br_switchdev_fdb_notify() which has a single flag set: BR_FDB_ADDED_BY_USER. This is further passed to the switchdev notifier chain, where interested drivers have no choice but to assume this is a static (does not age) and sticky (does not migrate) FDB entry. So currently, all drivers offload it to hardware as such, as can be seen below ("offload" is set). bridge fdb get 00:01:02:03:04:05 dev swp0 master 00:01:02:03:04:05 dev swp0 offload master br0 The software FDB entry expires $ageing_time centiseconds after the kernel last sees a packet with this MAC SA, and the bridge notifies its deletion as well, so it eventually disappears from hardware too. This is a problem, because it is actually desirable to start offloading "master dynamic" FDB entries correctly - they should expire $ageing_time centiseconds after the *hardware* port last sees a packet with this MAC SA - and this is how the current incorrect behavior was discovered. With an offloaded data plane, it can be expected that software only sees exception path packets, so an otherwise active dynamic FDB entry would be aged out by software sooner than it should. With the change in place, these FDB entries are no longer offloaded: bridge fdb get 00:01:02:03:04:05 dev swp0 master 00:01:02:03:04:05 dev swp0 master br0 and this also constitutes a better way (assuming a backport to stable kernels) for user space to determine whether the kernel has the capability of doing something sane with these or not. As opposed to "master dynamic" FDB entries, on the current behavior of which no one currently depends on (which can be deduced from the lack of kselftests), Ido Schimmel explains that entries with the "extern_learn" flag (BR_FDB_ADDED_BY_EXT_LEARN) should still be notified to switchdev, since the spectrum driver listens to them (and this is kind of okay, because although they are treated identically to "static", they are expected to not age, and to roam). Fixes: 6b26b51b1d13 ("net: bridge: Add support for notifying devices about FDB add/del") Link: https://lore.kernel.org/netdev/20230327115206.jk5q5l753aoelwus@skbuf/ Signed-off-by: Vladimir Oltean Reviewed-by: Jesse Brandeburg Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Link: https://lore.kernel.org/r/20230418155902.898627-1-vladimir.oltean@nxp.com Signed-off-by: Paolo Abeni --- net/bridge/br_switchdev.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index de18e9c1d7a7..ba95c4d74a60 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -148,6 +148,17 @@ br_switchdev_fdb_notify(struct net_bridge *br, if (test_bit(BR_FDB_LOCKED, &fdb->flags)) return; + /* Entries with these flags were created using ndm_state == NUD_REACHABLE, + * ndm_flags == NTF_MASTER( | NTF_STICKY), ext_flags == 0 by something + * equivalent to 'bridge fdb add ... master dynamic (sticky)'. + * Drivers don't know how to deal with these, so don't notify them to + * avoid confusing them. + */ + if (test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags) && + !test_bit(BR_FDB_STATIC, &fdb->flags) && + !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) + return; + br_switchdev_fdb_populate(br, &item, fdb, NULL); switch (type) { From 0f2a4af27b649c13ba76431552fe49c60120d0f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 13 Apr 2023 23:41:18 +0200 Subject: [PATCH 175/175] wifi: ath9k: Don't mark channelmap stack variable read-only in ath9k_mci_update_wlan_channels() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This partially reverts commit e161d4b60ae3a5356e07202e0bfedb5fad82c6aa. Turns out the channelmap variable is not actually read-only, it's modified through the MCI_GPM_CLR_CHANNEL_BIT() macro further down in the function, so making it read-only causes page faults when that code is hit. Link: https://bugzilla.kernel.org/show_bug.cgi?id=217183 Link: https://lore.kernel.org/r/20230413214118.153781-1-toke@toke.dk Fixes: e161d4b60ae3 ("wifi: ath9k: Make arrays prof_prio and channelmap static const") Cc: stable@vger.kernel.org Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Linus Torvalds --- drivers/net/wireless/ath/ath9k/mci.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/mci.c b/drivers/net/wireless/ath/ath9k/mci.c index 3363fc4e8966..a0845002d6fe 100644 --- a/drivers/net/wireless/ath/ath9k/mci.c +++ b/drivers/net/wireless/ath/ath9k/mci.c @@ -646,9 +646,7 @@ void ath9k_mci_update_wlan_channels(struct ath_softc *sc, bool allow_all) struct ath_hw *ah = sc->sc_ah; struct ath9k_hw_mci *mci = &ah->btcoex_hw.mci; struct ath9k_channel *chan = ah->curchan; - static const u32 channelmap[] = { - 0x00000000, 0xffff0000, 0xffffffff, 0x7fffffff - }; + u32 channelmap[] = {0x00000000, 0xffff0000, 0xffffffff, 0x7fffffff}; int i; s16 chan_start, chan_end; u16 wlan_chan;