From 3b1313eb32c499d46dc4c3e896d19d9564c879c4 Mon Sep 17 00:00:00 2001 From: Vladimir Kondratiev Date: Sun, 24 Nov 2019 16:07:31 +0200 Subject: [PATCH 001/146] mips: cacheinfo: report shared CPU map Report L1 caches as shared per core; L2 - per cluster. This fixes "perf" that went crazy if shared_cpu_map attribute not reported on sysfs, in form of /sys/devices/system/cpu/cpu*/cache/index*/shared_cpu_list /sys/devices/system/cpu/cpu*/cache/index*/shared_cpu_map Signed-off-by: Vladimir Kondratiev Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/kernel/cacheinfo.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/mips/kernel/cacheinfo.c b/arch/mips/kernel/cacheinfo.c index f777e44653d5..47312c529410 100644 --- a/arch/mips/kernel/cacheinfo.c +++ b/arch/mips/kernel/cacheinfo.c @@ -50,6 +50,25 @@ static int __init_cache_level(unsigned int cpu) return 0; } +static void fill_cpumask_siblings(int cpu, cpumask_t *cpu_map) +{ + int cpu1; + + for_each_possible_cpu(cpu1) + if (cpus_are_siblings(cpu, cpu1)) + cpumask_set_cpu(cpu1, cpu_map); +} + +static void fill_cpumask_cluster(int cpu, cpumask_t *cpu_map) +{ + int cpu1; + int cluster = cpu_cluster(&cpu_data[cpu]); + + for_each_possible_cpu(cpu1) + if (cpu_cluster(&cpu_data[cpu1]) == cluster) + cpumask_set_cpu(cpu1, cpu_map); +} + static int __populate_cache_leaves(unsigned int cpu) { struct cpuinfo_mips *c = ¤t_cpu_data; @@ -57,14 +76,20 @@ static int __populate_cache_leaves(unsigned int cpu) struct cacheinfo *this_leaf = this_cpu_ci->info_list; if (c->icache.waysize) { + /* L1 caches are per core */ + fill_cpumask_siblings(cpu, &this_leaf->shared_cpu_map); populate_cache(dcache, this_leaf, 1, CACHE_TYPE_DATA); + fill_cpumask_siblings(cpu, &this_leaf->shared_cpu_map); populate_cache(icache, this_leaf, 1, CACHE_TYPE_INST); } else { populate_cache(dcache, this_leaf, 1, CACHE_TYPE_UNIFIED); } - if (c->scache.waysize) + if (c->scache.waysize) { + /* L2 cache is per cluster */ + fill_cpumask_cluster(cpu, &this_leaf->shared_cpu_map); populate_cache(scache, this_leaf, 2, CACHE_TYPE_UNIFIED); + } if (c->tcache.waysize) populate_cache(tcache, this_leaf, 3, CACHE_TYPE_UNIFIED); From 87f67cc4c7b81273fd1d9f775192eeb3124562f9 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 27 Nov 2019 07:26:12 -0800 Subject: [PATCH 002/146] MIPS: Fix boot on Fuloong2 systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 268a2d60013049 ("MIPS: Loongson64: Rename CPU TYPES") changed Kconfig symbols as follows: CPU_LOONGSON2 to CPU_LOONGSON2EF CPU_LOONGSON3 to CPU_LOONGSON64 SYS_HAS_CPU_LOONGSON3 to SYS_HAS_CPU_LOONGSON64 It did not touch SYS_HAS_CPU_LOONGSON2E or SYS_HAS_CPU_LOONGSON2F. However, the patch changed a conditional from #if defined(CONFIG_SYS_HAS_CPU_LOONGSON2E) || \ defined(CONFIG_SYS_HAS_CPU_LOONGSON2F) to #if defined(CONFIG_SYS_HAS_CPU_LOONGSON2EF) SYS_HAS_CPU_LOONGSON2EF does not exist, resulting in boot failures with the qemu fulong2e emulation. Revert to the original code. Fixes: 268a2d60013049 ("MIPS: Loongson64: Rename CPU TYPES") Cc: Jiaxun Yang Signed-off-by: Guenter Roeck Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/include/asm/cpu-type.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/cpu-type.h b/arch/mips/include/asm/cpu-type.h index c46c59b0f1b4..49f0061a6051 100644 --- a/arch/mips/include/asm/cpu-type.h +++ b/arch/mips/include/asm/cpu-type.h @@ -15,7 +15,8 @@ static inline int __pure __get_cpu_type(const int cpu_type) { switch (cpu_type) { -#if defined(CONFIG_SYS_HAS_CPU_LOONGSON2EF) +#if defined(CONFIG_SYS_HAS_CPU_LOONGSON2E) || \ + defined(CONFIG_SYS_HAS_CPU_LOONGSON2F) case CPU_LOONGSON2EF: #endif From 7d2aa4bb90f5f6f1b8de8848c26042403f2d7bf9 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Fri, 29 Nov 2019 14:36:58 +0000 Subject: [PATCH 003/146] mips: Fix gettimeofday() in the vdso library The libc provides a discovery mechanism for vDSO library and its symbols. When a symbol is not exposed by the vDSOs the libc falls back on the system calls. With the introduction of the unified vDSO library on mips this behavior is not honored anymore by the kernel in the case of gettimeofday(). The issue has been noticed and reported due to a dhclient failure on the CI20 board: root@letux:~# dhclient ../../../../lib/isc/unix/time.c:200: Operation not permitted root@letux:~# Restore the original behavior fixing gettimeofday() in the vDSO library. Reported-by: H. Nikolaus Schaller Tested-by: H. Nikolaus Schaller # CI20 with JZ4780 Signed-off-by: Vincenzo Frascino Signed-off-by: Paul Burton Cc: mips-creator-ci20-dev@googlegroups.com Cc: letux-kernel@openphoenux.org Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/include/asm/vdso/gettimeofday.h | 13 ------------- arch/mips/vdso/vgettimeofday.c | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/mips/include/asm/vdso/gettimeofday.h b/arch/mips/include/asm/vdso/gettimeofday.h index b08825531e9f..0ae9b4cbc153 100644 --- a/arch/mips/include/asm/vdso/gettimeofday.h +++ b/arch/mips/include/asm/vdso/gettimeofday.h @@ -26,8 +26,6 @@ #define __VDSO_USE_SYSCALL ULLONG_MAX -#ifdef CONFIG_MIPS_CLOCK_VSYSCALL - static __always_inline long gettimeofday_fallback( struct __kernel_old_timeval *_tv, struct timezone *_tz) @@ -48,17 +46,6 @@ static __always_inline long gettimeofday_fallback( return error ? -ret : ret; } -#else - -static __always_inline long gettimeofday_fallback( - struct __kernel_old_timeval *_tv, - struct timezone *_tz) -{ - return -1; -} - -#endif - static __always_inline long clock_gettime_fallback( clockid_t _clkid, struct __kernel_timespec *_ts) diff --git a/arch/mips/vdso/vgettimeofday.c b/arch/mips/vdso/vgettimeofday.c index 6ebdc37c89fc..6b83b6376a4b 100644 --- a/arch/mips/vdso/vgettimeofday.c +++ b/arch/mips/vdso/vgettimeofday.c @@ -17,12 +17,22 @@ int __vdso_clock_gettime(clockid_t clock, return __cvdso_clock_gettime32(clock, ts); } +#ifdef CONFIG_MIPS_CLOCK_VSYSCALL + +/* + * This is behind the ifdef so that we don't provide the symbol when there's no + * possibility of there being a usable clocksource, because there's nothing we + * can do without it. When libc fails the symbol lookup it should fall back on + * the standard syscall path. + */ int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { return __cvdso_gettimeofday(tv, tz); } +#endif /* CONFIG_MIPS_CLOCK_VSYSCALL */ + int __vdso_clock_getres(clockid_t clock_id, struct old_timespec32 *res) { @@ -43,12 +53,22 @@ int __vdso_clock_gettime(clockid_t clock, return __cvdso_clock_gettime(clock, ts); } +#ifdef CONFIG_MIPS_CLOCK_VSYSCALL + +/* + * This is behind the ifdef so that we don't provide the symbol when there's no + * possibility of there being a usable clocksource, because there's nothing we + * can do without it. When libc fails the symbol lookup it should fall back on + * the standard syscall path. + */ int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { return __cvdso_gettimeofday(tv, tz); } +#endif /* CONFIG_MIPS_CLOCK_VSYSCALL */ + int __vdso_clock_getres(clockid_t clock_id, struct __kernel_timespec *res) { From a7effde99b6e8119b28c235eda5419c2e03b3089 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 28 Nov 2019 09:13:26 -0800 Subject: [PATCH 004/146] MIPS: Kconfig: Use correct form for 'depends on' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the CONFIG_ prefix from "depends on" as it makes the selection not possible. Signed-off-by: Joe Perches Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: linux-mips@vger.kernel.org Cc: LKML --- drivers/platform/mips/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/mips/Kconfig b/drivers/platform/mips/Kconfig index f4d0a86c00d0..5e77b0dc5fd6 100644 --- a/drivers/platform/mips/Kconfig +++ b/drivers/platform/mips/Kconfig @@ -18,7 +18,7 @@ if MIPS_PLATFORM_DEVICES config CPU_HWMON tristate "Loongson-3 CPU HWMon Driver" - depends on CONFIG_MACH_LOONGSON64 + depends on MACH_LOONGSON64 select HWMON default y help From 0b8d616fb5a8ffa307b1d3af37f55c15dae14f28 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 9 Oct 2019 13:48:09 +0200 Subject: [PATCH 005/146] taskstats: fix data-race When assiging and testing taskstats in taskstats_exit() there's a race when setting up and reading sig->stats when a thread-group with more than one thread exits: write to 0xffff8881157bbe10 of 8 bytes by task 7951 on cpu 0: taskstats_tgid_alloc kernel/taskstats.c:567 [inline] taskstats_exit+0x6b7/0x717 kernel/taskstats.c:596 do_exit+0x2c2/0x18e0 kernel/exit.c:864 do_group_exit+0xb4/0x1c0 kernel/exit.c:983 get_signal+0x2a2/0x1320 kernel/signal.c:2734 do_signal+0x3b/0xc00 arch/x86/kernel/signal.c:815 exit_to_usermode_loop+0x250/0x2c0 arch/x86/entry/common.c:159 prepare_exit_to_usermode arch/x86/entry/common.c:194 [inline] syscall_return_slowpath arch/x86/entry/common.c:274 [inline] do_syscall_64+0x2d7/0x2f0 arch/x86/entry/common.c:299 entry_SYSCALL_64_after_hwframe+0x44/0xa9 read to 0xffff8881157bbe10 of 8 bytes by task 7949 on cpu 1: taskstats_tgid_alloc kernel/taskstats.c:559 [inline] taskstats_exit+0xb2/0x717 kernel/taskstats.c:596 do_exit+0x2c2/0x18e0 kernel/exit.c:864 do_group_exit+0xb4/0x1c0 kernel/exit.c:983 __do_sys_exit_group kernel/exit.c:994 [inline] __se_sys_exit_group kernel/exit.c:992 [inline] __x64_sys_exit_group+0x2e/0x30 kernel/exit.c:992 do_syscall_64+0xcf/0x2f0 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix this by using smp_load_acquire() and smp_store_release(). Reported-by: syzbot+c5d03165a1bd1dead0c1@syzkaller.appspotmail.com Fixes: 34ec12349c8a ("taskstats: cleanup ->signal->stats allocation") Cc: stable@vger.kernel.org Signed-off-by: Christian Brauner Acked-by: Marco Elver Reviewed-by: Will Deacon Reviewed-by: Andrea Parri Reviewed-by: Dmitry Vyukov Link: https://lore.kernel.org/r/20191009114809.8643-1-christian.brauner@ubuntu.com --- kernel/taskstats.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 13a0f2e6ebc2..e2ac0e37c4ae 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -554,25 +554,33 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; - struct taskstats *stats; + struct taskstats *stats_new, *stats; - if (sig->stats || thread_group_empty(tsk)) - goto ret; + /* Pairs with smp_store_release() below. */ + stats = smp_load_acquire(&sig->stats); + if (stats || thread_group_empty(tsk)) + return stats; /* No problem if kmem_cache_zalloc() fails */ - stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); + stats_new = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); spin_lock_irq(&tsk->sighand->siglock); - if (!sig->stats) { - sig->stats = stats; - stats = NULL; + stats = sig->stats; + if (!stats) { + /* + * Pairs with smp_store_release() above and order the + * kmem_cache_zalloc(). + */ + smp_store_release(&sig->stats, stats_new); + stats = stats_new; + stats_new = NULL; } spin_unlock_irq(&tsk->sighand->siglock); - if (stats) - kmem_cache_free(taskstats_cache, stats); -ret: - return sig->stats; + if (stats_new) + kmem_cache_free(taskstats_cache, stats_new); + + return stats; } /* Send pid data out on exit */ From 13b0ba33d78b32384ea38abd03bc2b795fcab194 Mon Sep 17 00:00:00 2001 From: Pi-Hsun Shih Date: Mon, 18 Nov 2019 14:18:05 +0800 Subject: [PATCH 006/146] drm/mediatek: Check return value of mtk_drm_ddp_comp_for_plane. The mtk_drm_ddp_comp_for_plane can return NULL, but the usage doesn't check for it. Add check for it. Fixes: d6b53f68356f ("drm/mediatek: Add helper to get component for a plane") Signed-off-by: Pi-Hsun Shih Signed-off-by: CK Hu --- drivers/gpu/drm/mediatek/mtk_drm_crtc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c index f80a8ba75977..4c4f976c994e 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c @@ -310,7 +310,9 @@ static int mtk_crtc_ddp_hw_init(struct mtk_drm_crtc *mtk_crtc) plane_state = to_mtk_plane_state(plane->state); comp = mtk_drm_ddp_comp_for_plane(crtc, plane, &local_layer); - mtk_ddp_comp_layer_config(comp, local_layer, plane_state); + if (comp) + mtk_ddp_comp_layer_config(comp, local_layer, + plane_state); } return 0; @@ -386,8 +388,9 @@ static void mtk_crtc_ddp_config(struct drm_crtc *crtc) comp = mtk_drm_ddp_comp_for_plane(crtc, plane, &local_layer); - mtk_ddp_comp_layer_config(comp, local_layer, - plane_state); + if (comp) + mtk_ddp_comp_layer_config(comp, local_layer, + plane_state); plane_state->pending.config = false; } mtk_crtc->pending_planes = false; @@ -401,7 +404,9 @@ int mtk_drm_crtc_plane_check(struct drm_crtc *crtc, struct drm_plane *plane, struct mtk_ddp_comp *comp; comp = mtk_drm_ddp_comp_for_plane(crtc, plane, &local_layer); - return mtk_ddp_comp_layer_check(comp, local_layer, state); + if (comp) + return mtk_ddp_comp_layer_check(comp, local_layer, state); + return 0; } static void mtk_drm_crtc_atomic_enable(struct drm_crtc *crtc, From 92c17f6043647652e1f4a772636e4288d4ecea3e Mon Sep 17 00:00:00 2001 From: Yongqiang Niu Date: Wed, 27 Nov 2019 18:04:19 +0800 Subject: [PATCH 007/146] drm/mediatek: Fix can't get component for external display plane. The original logic is ok for primary display, but will not find out component for external display. For example, plane->index is 6 for external display, but there are only 2 layer nr in external display, and this condition will never happen: if (plane->index < (count + mtk_ddp_comp_layer_nr(comp))) Fix this by using the offset of the plane to mtk_crtc->planes as index, instead of plane->index. Fixes: d6b53f68356f ("drm/mediatek: Add helper to get component for a plane") Signed-off-by: Yongqiang Niu Signed-off-by: Pi-Hsun Shih Signed-off-by: CK Hu --- drivers/gpu/drm/mediatek/mtk_drm_crtc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c index 4c4f976c994e..3305a94fc930 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c @@ -215,11 +215,12 @@ struct mtk_ddp_comp *mtk_drm_ddp_comp_for_plane(struct drm_crtc *crtc, struct mtk_drm_crtc *mtk_crtc = to_mtk_crtc(crtc); struct mtk_ddp_comp *comp; int i, count = 0; + unsigned int local_index = plane - mtk_crtc->planes; for (i = 0; i < mtk_crtc->ddp_comp_nr; i++) { comp = mtk_crtc->ddp_comp[i]; - if (plane->index < (count + mtk_ddp_comp_layer_nr(comp))) { - *local_layer = plane->index - count; + if (local_index < (count + mtk_ddp_comp_layer_nr(comp))) { + *local_layer = local_index - count; return comp; } count += mtk_ddp_comp_layer_nr(comp); From 53a256a9b925b47c7e67fc1f16ca41561a7b877c Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 5 Dec 2019 12:54:49 +0100 Subject: [PATCH 008/146] dmaengine: Fix access to uninitialized dma_slave_caps dmaengine_desc_set_reuse() allocates a struct dma_slave_caps on the stack, populates it using dma_get_slave_caps() and then accesses one of its members. However dma_get_slave_caps() may fail and this isn't accounted for, leading to a legitimate warning of gcc-4.9 (but not newer versions): In file included from drivers/spi/spi-bcm2835.c:19:0: drivers/spi/spi-bcm2835.c: In function 'dmaengine_desc_set_reuse': >> include/linux/dmaengine.h:1370:10: warning: 'caps.descriptor_reuse' is used uninitialized in this function [-Wuninitialized] if (caps.descriptor_reuse) { Fix it, thereby also silencing the gcc-4.9 warning. The issue has been present for 4 years but surfaces only now that the first caller of dmaengine_desc_set_reuse() has been added in spi-bcm2835.c. Another user of reusable DMA descriptors has existed for a while in pxa_camera.c, but it sets the DMA_CTRL_REUSE flag directly instead of calling dmaengine_desc_set_reuse(). Nevertheless, tag this commit for stable in case there are out-of-tree users. Fixes: 272420214d26 ("dmaengine: Add DMA_CTRL_REUSE") Reported-by: kbuild test robot Signed-off-by: Lukas Wunner Cc: stable@vger.kernel.org # v4.3+ Link: https://lore.kernel.org/r/ca92998ccc054b4f2bfd60ef3adbab2913171eac.1575546234.git.lukas@wunner.de Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 8fcdee1c0cf9..dad4a68fa009 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1364,8 +1364,11 @@ static inline int dma_get_slave_caps(struct dma_chan *chan, static inline int dmaengine_desc_set_reuse(struct dma_async_tx_descriptor *tx) { struct dma_slave_caps caps; + int ret; - dma_get_slave_caps(tx->chan, &caps); + ret = dma_get_slave_caps(tx->chan, &caps); + if (ret) + return ret; if (caps.descriptor_reuse) { tx->flags |= DMA_CTRL_REUSE; From 6f7c41374b62fd80bbd8aae3536c43688c54d95e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 25 Nov 2019 10:46:51 +0900 Subject: [PATCH 009/146] tomoyo: Don't use nifty names on sockets. syzbot is reporting that use of SOCKET_I()->sk from open() can result in use after free problem [1], for socket's inode is still reachable via /proc/pid/fd/n despite destruction of SOCKET_I()->sk already completed. At first I thought that this race condition applies to only open/getattr permission checks. But James Morris has pointed out that there are more permission checks where this race condition applies to. Thus, get rid of tomoyo_get_socket_name() instead of conditionally bypassing permission checks on sockets. As a side effect of this patch, "socket:[family=\$:type=\$:protocol=\$]" in the policy files has to be rewritten to "socket:[\$]". [1] https://syzkaller.appspot.com/bug?id=73d590010454403d55164cca23bd0565b1eb3b74 Signed-off-by: Tetsuo Handa Reported-by: syzbot Reported-by: James Morris --- security/tomoyo/realpath.c | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index e7832448d721..bf38fc1b59b2 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -217,31 +217,6 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer, return ERR_PTR(-ENOMEM); } -/** - * tomoyo_get_socket_name - Get the name of a socket. - * - * @path: Pointer to "struct path". - * @buffer: Pointer to buffer to return value in. - * @buflen: Sizeof @buffer. - * - * Returns the buffer. - */ -static char *tomoyo_get_socket_name(const struct path *path, char * const buffer, - const int buflen) -{ - struct inode *inode = d_backing_inode(path->dentry); - struct socket *sock = inode ? SOCKET_I(inode) : NULL; - struct sock *sk = sock ? sock->sk : NULL; - - if (sk) { - snprintf(buffer, buflen, "socket:[family=%u:type=%u:protocol=%u]", - sk->sk_family, sk->sk_type, sk->sk_protocol); - } else { - snprintf(buffer, buflen, "socket:[unknown]"); - } - return buffer; -} - /** * tomoyo_realpath_from_path - Returns realpath(3) of the given pathname but ignores chroot'ed root. * @@ -279,12 +254,7 @@ char *tomoyo_realpath_from_path(const struct path *path) break; /* To make sure that pos is '\0' terminated. */ buf[buf_len - 1] = '\0'; - /* Get better name for socket. */ - if (sb->s_magic == SOCKFS_MAGIC) { - pos = tomoyo_get_socket_name(path, buf, buf_len - 1); - goto encode; - } - /* For "pipe:[\$]". */ + /* For "pipe:[\$]" and "socket:[\$]". */ if (dentry->d_op && dentry->d_op->d_dname) { pos = dentry->d_op->d_dname(dentry, buf, buf_len - 1); goto encode; From a40c94be2336f3002563c9ae16572143ae3422e2 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Tue, 10 Dec 2019 17:55:45 +0100 Subject: [PATCH 010/146] dmaengine: dma-jz4780: Also break descriptor chains on JZ4725B It turns out that the JZ4725B displays the same buggy behaviour as the JZ4740 that was described in commit f4c255f1a747 ("dmaengine: dma-jz4780: Break descriptor chains on JZ4740"). Work around it by using the same workaround previously used for the JZ4740. Fixes commit f4c255f1a747 ("dmaengine: dma-jz4780: Break descriptor chains on JZ4740") Cc: Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20191210165545.59690-1-paul@crapouillou.net Signed-off-by: Vinod Koul --- drivers/dma/dma-jz4780.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/dma-jz4780.c b/drivers/dma/dma-jz4780.c index fa626acdc9b9..44af435628f8 100644 --- a/drivers/dma/dma-jz4780.c +++ b/drivers/dma/dma-jz4780.c @@ -999,7 +999,8 @@ static const struct jz4780_dma_soc_data jz4740_dma_soc_data = { static const struct jz4780_dma_soc_data jz4725b_dma_soc_data = { .nb_channels = 6, .transfer_ord_max = 5, - .flags = JZ_SOC_DATA_PER_CHAN_PM | JZ_SOC_DATA_NO_DCKES_DCKEC, + .flags = JZ_SOC_DATA_PER_CHAN_PM | JZ_SOC_DATA_NO_DCKES_DCKEC | + JZ_SOC_DATA_BREAK_LINKS, }; static const struct jz4780_dma_soc_data jz4770_dma_soc_data = { From cec935ce69fc386f13959578deb40963ebbb85c3 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 4 Dec 2019 08:52:08 +0100 Subject: [PATCH 011/146] media: cec: CEC 2.0-only bcast messages were ignored Some messages are allowed to be a broadcast message in CEC 2.0 only, and should be ignored by CEC 1.4 devices. Unfortunately, the check was wrong, causing such messages to be marked as invalid under CEC 2.0. Signed-off-by: Hans Verkuil Cc: # for v4.10 and up Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-adap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c index 9340435a94a0..e90c30dac68b 100644 --- a/drivers/media/cec/cec-adap.c +++ b/drivers/media/cec/cec-adap.c @@ -1085,11 +1085,11 @@ void cec_received_msg_ts(struct cec_adapter *adap, valid_la = false; else if (!cec_msg_is_broadcast(msg) && !(dir_fl & DIRECTED)) valid_la = false; - else if (cec_msg_is_broadcast(msg) && !(dir_fl & BCAST1_4)) + else if (cec_msg_is_broadcast(msg) && !(dir_fl & BCAST)) valid_la = false; else if (cec_msg_is_broadcast(msg) && - adap->log_addrs.cec_version >= CEC_OP_CEC_VERSION_2_0 && - !(dir_fl & BCAST2_0)) + adap->log_addrs.cec_version < CEC_OP_CEC_VERSION_2_0 && + !(dir_fl & BCAST1_4)) valid_la = false; } if (valid_la && min_len) { From 95c29d46ab2a517e4c26d0a07300edca6768db17 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sat, 7 Dec 2019 23:48:09 +0100 Subject: [PATCH 012/146] media: cec: avoid decrementing transmit_queue_sz if it is 0 WARN if transmit_queue_sz is 0 but do not decrement it. The CEC adapter will become unresponsive if it goes below 0 since then it thinks there are 4 billion messages in the queue. Obviously this should not happen, but a driver bug could cause this. Signed-off-by: Hans Verkuil Cc: # for v4.12 and up Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-adap.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c index e90c30dac68b..1060e633b623 100644 --- a/drivers/media/cec/cec-adap.c +++ b/drivers/media/cec/cec-adap.c @@ -380,7 +380,8 @@ static void cec_data_cancel(struct cec_data *data, u8 tx_status) } else { list_del_init(&data->list); if (!(data->msg.tx_status & CEC_TX_STATUS_OK)) - data->adap->transmit_queue_sz--; + if (!WARN_ON(!data->adap->transmit_queue_sz)) + data->adap->transmit_queue_sz--; } if (data->msg.tx_status & CEC_TX_STATUS_OK) { @@ -432,6 +433,14 @@ static void cec_flush(struct cec_adapter *adap) * need to do anything special in that case. */ } + /* + * If something went wrong and this counter isn't what it should + * be, then this will reset it back to 0. Warn if it is not 0, + * since it indicates a bug, either in this framework or in a + * CEC driver. + */ + if (WARN_ON(adap->transmit_queue_sz)) + adap->transmit_queue_sz = 0; } /* @@ -522,7 +531,8 @@ int cec_thread_func(void *_adap) data = list_first_entry(&adap->transmit_queue, struct cec_data, list); list_del_init(&data->list); - adap->transmit_queue_sz--; + if (!WARN_ON(!data->adap->transmit_queue_sz)) + adap->transmit_queue_sz--; /* Make this the current transmitting message */ adap->transmitting = data; From ac479b51f3f4aaa852b5d3f00ecfb9290230cf64 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 11 Dec 2019 12:47:57 +0100 Subject: [PATCH 013/146] media: cec: check 'transmit_in_progress', not 'transmitting' Currently wait_event_interruptible_timeout is called in cec_thread_func() when adap->transmitting is set. But if the adapter is unconfigured while transmitting, then adap->transmitting is set to NULL. But the hardware is still actually transmitting the message, and that's indicated by adap->transmit_in_progress and we should wait until that is finished or times out before transmitting new messages. As the original commit says: adap->transmitting is the userspace view, adap->transmit_in_progress reflects the hardware state. However, if adap->transmitting is NULL and adap->transmit_in_progress is true, then wait_event_interruptible is called (no timeout), which can get stuck indefinitely if the CEC driver is flaky and never marks the transmit-in-progress as 'done'. So test against transmit_in_progress when deciding whether to use the timeout variant or not, instead of testing against adap->transmitting. Signed-off-by: Hans Verkuil Fixes: 32804fcb612b ("media: cec: keep track of outstanding transmits") Cc: # for v4.19 and up Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-adap.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c index 1060e633b623..6c95dc471d4c 100644 --- a/drivers/media/cec/cec-adap.c +++ b/drivers/media/cec/cec-adap.c @@ -465,7 +465,7 @@ int cec_thread_func(void *_adap) bool timeout = false; u8 attempts; - if (adap->transmitting) { + if (adap->transmit_in_progress) { int err; /* @@ -500,7 +500,7 @@ int cec_thread_func(void *_adap) goto unlock; } - if (adap->transmitting && timeout) { + if (adap->transmit_in_progress && timeout) { /* * If we timeout, then log that. Normally this does * not happen and it is an indication of a faulty CEC @@ -509,14 +509,18 @@ int cec_thread_func(void *_adap) * so much traffic on the bus that the adapter was * unable to transmit for CEC_XFER_TIMEOUT_MS (2.1s). */ - pr_warn("cec-%s: message %*ph timed out\n", adap->name, - adap->transmitting->msg.len, - adap->transmitting->msg.msg); + if (adap->transmitting) { + pr_warn("cec-%s: message %*ph timed out\n", adap->name, + adap->transmitting->msg.len, + adap->transmitting->msg.msg); + /* Just give up on this. */ + cec_data_cancel(adap->transmitting, + CEC_TX_STATUS_TIMEOUT); + } else { + pr_warn("cec-%s: transmit timed out\n", adap->name); + } adap->transmit_in_progress = false; adap->tx_timeouts++; - /* Just give up on this. */ - cec_data_cancel(adap->transmitting, - CEC_TX_STATUS_TIMEOUT); goto unlock; } From e5a52a1d15c79bb48a430fb263852263ec1d3f11 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sat, 7 Dec 2019 23:43:23 +0100 Subject: [PATCH 014/146] media: pulse8-cec: fix lost cec_transmit_attempt_done() call The periodic PING command could interfere with the result of a CEC transmit, causing a lost cec_transmit_attempt_done() call. Signed-off-by: Hans Verkuil Cc: # for v4.10 and up Signed-off-by: Mauro Carvalho Chehab --- drivers/media/usb/pulse8-cec/pulse8-cec.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/media/usb/pulse8-cec/pulse8-cec.c b/drivers/media/usb/pulse8-cec/pulse8-cec.c index ac88ade94cda..59609556d969 100644 --- a/drivers/media/usb/pulse8-cec/pulse8-cec.c +++ b/drivers/media/usb/pulse8-cec/pulse8-cec.c @@ -116,6 +116,7 @@ struct pulse8 { unsigned int vers; struct completion cmd_done; struct work_struct work; + u8 work_result; struct delayed_work ping_eeprom_work; struct cec_msg rx_msg; u8 data[DATA_SIZE]; @@ -137,8 +138,10 @@ static void pulse8_irq_work_handler(struct work_struct *work) { struct pulse8 *pulse8 = container_of(work, struct pulse8, work); + u8 result = pulse8->work_result; - switch (pulse8->data[0] & 0x3f) { + pulse8->work_result = 0; + switch (result & 0x3f) { case MSGCODE_FRAME_DATA: cec_received_msg(pulse8->adap, &pulse8->rx_msg); break; @@ -172,12 +175,12 @@ static irqreturn_t pulse8_interrupt(struct serio *serio, unsigned char data, pulse8->escape = false; } else if (data == MSGEND) { struct cec_msg *msg = &pulse8->rx_msg; + u8 msgcode = pulse8->buf[0]; if (debug) dev_info(pulse8->dev, "received: %*ph\n", pulse8->idx, pulse8->buf); - pulse8->data[0] = pulse8->buf[0]; - switch (pulse8->buf[0] & 0x3f) { + switch (msgcode & 0x3f) { case MSGCODE_FRAME_START: msg->len = 1; msg->msg[0] = pulse8->buf[1]; @@ -186,14 +189,20 @@ static irqreturn_t pulse8_interrupt(struct serio *serio, unsigned char data, if (msg->len == CEC_MAX_MSG_SIZE) break; msg->msg[msg->len++] = pulse8->buf[1]; - if (pulse8->buf[0] & MSGCODE_FRAME_EOM) + if (msgcode & MSGCODE_FRAME_EOM) { + WARN_ON(pulse8->work_result); + pulse8->work_result = msgcode; schedule_work(&pulse8->work); + break; + } break; case MSGCODE_TRANSMIT_SUCCEEDED: case MSGCODE_TRANSMIT_FAILED_LINE: case MSGCODE_TRANSMIT_FAILED_ACK: case MSGCODE_TRANSMIT_FAILED_TIMEOUT_DATA: case MSGCODE_TRANSMIT_FAILED_TIMEOUT_LINE: + WARN_ON(pulse8->work_result); + pulse8->work_result = msgcode; schedule_work(&pulse8->work); break; case MSGCODE_HIGH_ERROR: From 6bd5ce6089b561f5392460bfb654dea89356ab1b Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 16 Dec 2019 19:16:48 +0900 Subject: [PATCH 015/146] tomoyo: Suppress RCU warning at list_for_each_entry_rcu(). John Garry has reported that allmodconfig kernel on arm64 causes flood of "RCU-list traversed in non-reader section!!" warning. I don't know what change caused this warning, but this warning is safe because TOMOYO uses SRCU lock instead. Let's suppress this warning by explicitly telling that the caller is holding SRCU lock. Reported-and-tested-by: John Garry Signed-off-by: Tetsuo Handa --- security/tomoyo/common.c | 9 ++++++--- security/tomoyo/domain.c | 15 ++++++++++----- security/tomoyo/group.c | 9 ++++++--- security/tomoyo/util.c | 6 ++++-- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index dd3d5942e669..c36bafbcd77e 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -951,7 +951,8 @@ static bool tomoyo_manager(void) exe = tomoyo_get_exe(); if (!exe) return false; - list_for_each_entry_rcu(ptr, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER], head.list) { + list_for_each_entry_rcu(ptr, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER], head.list, + srcu_read_lock_held(&tomoyo_ss)) { if (!ptr->head.is_deleted && (!tomoyo_pathcmp(domainname, ptr->manager) || !strcmp(exe, ptr->manager->name))) { @@ -1095,7 +1096,8 @@ static int tomoyo_delete_domain(char *domainname) if (mutex_lock_interruptible(&tomoyo_policy_lock)) return -EINTR; /* Is there an active domain? */ - list_for_each_entry_rcu(domain, &tomoyo_domain_list, list) { + list_for_each_entry_rcu(domain, &tomoyo_domain_list, list, + srcu_read_lock_held(&tomoyo_ss)) { /* Never delete tomoyo_kernel_domain */ if (domain == &tomoyo_kernel_domain) continue; @@ -2778,7 +2780,8 @@ void tomoyo_check_profile(void) tomoyo_policy_loaded = true; pr_info("TOMOYO: 2.6.0\n"); - list_for_each_entry_rcu(domain, &tomoyo_domain_list, list) { + list_for_each_entry_rcu(domain, &tomoyo_domain_list, list, + srcu_read_lock_held(&tomoyo_ss)) { const u8 profile = domain->profile; struct tomoyo_policy_namespace *ns = domain->ns; diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 8526a0a74023..7869d6a9980b 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -41,7 +41,8 @@ int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size, if (mutex_lock_interruptible(&tomoyo_policy_lock)) return -ENOMEM; - list_for_each_entry_rcu(entry, list, list) { + list_for_each_entry_rcu(entry, list, list, + srcu_read_lock_held(&tomoyo_ss)) { if (entry->is_deleted == TOMOYO_GC_IN_PROGRESS) continue; if (!check_duplicate(entry, new_entry)) @@ -119,7 +120,8 @@ int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size, } if (mutex_lock_interruptible(&tomoyo_policy_lock)) goto out; - list_for_each_entry_rcu(entry, list, list) { + list_for_each_entry_rcu(entry, list, list, + srcu_read_lock_held(&tomoyo_ss)) { if (entry->is_deleted == TOMOYO_GC_IN_PROGRESS) continue; if (!tomoyo_same_acl_head(entry, new_entry) || @@ -166,7 +168,8 @@ void tomoyo_check_acl(struct tomoyo_request_info *r, u16 i = 0; retry: - list_for_each_entry_rcu(ptr, list, list) { + list_for_each_entry_rcu(ptr, list, list, + srcu_read_lock_held(&tomoyo_ss)) { if (ptr->is_deleted || ptr->type != r->param_type) continue; if (!check_entry(r, ptr)) @@ -298,7 +301,8 @@ static inline bool tomoyo_scan_transition { const struct tomoyo_transition_control *ptr; - list_for_each_entry_rcu(ptr, list, head.list) { + list_for_each_entry_rcu(ptr, list, head.list, + srcu_read_lock_held(&tomoyo_ss)) { if (ptr->head.is_deleted || ptr->type != type) continue; if (ptr->domainname) { @@ -735,7 +739,8 @@ int tomoyo_find_next_domain(struct linux_binprm *bprm) /* Check 'aggregator' directive. */ candidate = &exename; - list_for_each_entry_rcu(ptr, list, head.list) { + list_for_each_entry_rcu(ptr, list, head.list, + srcu_read_lock_held(&tomoyo_ss)) { if (ptr->head.is_deleted || !tomoyo_path_matches_pattern(&exename, ptr->original_name)) diff --git a/security/tomoyo/group.c b/security/tomoyo/group.c index a37c7dc66e44..1cecdd797597 100644 --- a/security/tomoyo/group.c +++ b/security/tomoyo/group.c @@ -133,7 +133,8 @@ tomoyo_path_matches_group(const struct tomoyo_path_info *pathname, { struct tomoyo_path_group *member; - list_for_each_entry_rcu(member, &group->member_list, head.list) { + list_for_each_entry_rcu(member, &group->member_list, head.list, + srcu_read_lock_held(&tomoyo_ss)) { if (member->head.is_deleted) continue; if (!tomoyo_path_matches_pattern(pathname, member->member_name)) @@ -161,7 +162,8 @@ bool tomoyo_number_matches_group(const unsigned long min, struct tomoyo_number_group *member; bool matched = false; - list_for_each_entry_rcu(member, &group->member_list, head.list) { + list_for_each_entry_rcu(member, &group->member_list, head.list, + srcu_read_lock_held(&tomoyo_ss)) { if (member->head.is_deleted) continue; if (min > member->number.values[1] || @@ -191,7 +193,8 @@ bool tomoyo_address_matches_group(const bool is_ipv6, const __be32 *address, bool matched = false; const u8 size = is_ipv6 ? 16 : 4; - list_for_each_entry_rcu(member, &group->member_list, head.list) { + list_for_each_entry_rcu(member, &group->member_list, head.list, + srcu_read_lock_held(&tomoyo_ss)) { if (member->head.is_deleted) continue; if (member->address.is_ipv6 != is_ipv6) diff --git a/security/tomoyo/util.c b/security/tomoyo/util.c index 52752e1a84ed..eba0b3395851 100644 --- a/security/tomoyo/util.c +++ b/security/tomoyo/util.c @@ -594,7 +594,8 @@ struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname) name.name = domainname; tomoyo_fill_path_info(&name); - list_for_each_entry_rcu(domain, &tomoyo_domain_list, list) { + list_for_each_entry_rcu(domain, &tomoyo_domain_list, list, + srcu_read_lock_held(&tomoyo_ss)) { if (!domain->is_deleted && !tomoyo_pathcmp(&name, domain->domainname)) return domain; @@ -1028,7 +1029,8 @@ bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r) return false; if (!domain) return true; - list_for_each_entry_rcu(ptr, &domain->acl_info_list, list) { + list_for_each_entry_rcu(ptr, &domain->acl_info_list, list, + srcu_read_lock_held(&tomoyo_ss)) { u16 perm; u8 i; From e18e0f6b7c8f220774dd68965e8a9b046905acc8 Mon Sep 17 00:00:00 2001 From: Jitao Shi Date: Fri, 13 Dec 2019 17:52:15 +0800 Subject: [PATCH 016/146] drm/mediatek: reduce the hbp and hfp for phy timing There are some extra data transfer in dsi. ex. LPX, hs_prepare, hs_zero, hs_exit and the sof/eof of dsi packet. This signal will enlarge the line time. So the real frame on dsi bus will be lower than calc by video timing. So dsi driver reduces the hbp and hfp to keep the line time. Fixes: 7a5bc4e22ecf ("drm/mediatek: change the dsi phytiming calculate method") Signed-off-by: Jitao Shi Tested-by: Hsin-Yi Wang Tested-by: Enric Balletbo i Serra Signed-off-by: CK Hu --- drivers/gpu/drm/mediatek/mtk_dsi.c | 67 +++++++++++++++++------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_dsi.c b/drivers/gpu/drm/mediatek/mtk_dsi.c index e9931bbbe846..d77c9f484ce3 100644 --- a/drivers/gpu/drm/mediatek/mtk_dsi.c +++ b/drivers/gpu/drm/mediatek/mtk_dsi.c @@ -230,28 +230,25 @@ static void mtk_dsi_mask(struct mtk_dsi *dsi, u32 offset, u32 mask, u32 data) static void mtk_dsi_phy_timconfig(struct mtk_dsi *dsi) { u32 timcon0, timcon1, timcon2, timcon3; - u32 ui, cycle_time; + u32 data_rate_mhz = DIV_ROUND_UP(dsi->data_rate, 1000000); struct mtk_phy_timing *timing = &dsi->phy_timing; - ui = DIV_ROUND_UP(1000000000, dsi->data_rate); - cycle_time = div_u64(8000000000ULL, dsi->data_rate); + timing->lpx = (60 * data_rate_mhz / (8 * 1000)) + 1; + timing->da_hs_prepare = (80 * data_rate_mhz + 4 * 1000) / 8000; + timing->da_hs_zero = (170 * data_rate_mhz + 10 * 1000) / 8000 + 1 - + timing->da_hs_prepare; + timing->da_hs_trail = timing->da_hs_prepare + 1; - timing->lpx = NS_TO_CYCLE(60, cycle_time); - timing->da_hs_prepare = NS_TO_CYCLE(50 + 5 * ui, cycle_time); - timing->da_hs_zero = NS_TO_CYCLE(110 + 6 * ui, cycle_time); - timing->da_hs_trail = NS_TO_CYCLE(77 + 4 * ui, cycle_time); + timing->ta_go = 4 * timing->lpx - 2; + timing->ta_sure = timing->lpx + 2; + timing->ta_get = 4 * timing->lpx; + timing->da_hs_exit = 2 * timing->lpx + 1; - timing->ta_go = 4 * timing->lpx; - timing->ta_sure = 3 * timing->lpx / 2; - timing->ta_get = 5 * timing->lpx; - timing->da_hs_exit = 2 * timing->lpx; - - timing->clk_hs_zero = NS_TO_CYCLE(336, cycle_time); - timing->clk_hs_trail = NS_TO_CYCLE(100, cycle_time) + 10; - - timing->clk_hs_prepare = NS_TO_CYCLE(64, cycle_time); - timing->clk_hs_post = NS_TO_CYCLE(80 + 52 * ui, cycle_time); - timing->clk_hs_exit = 2 * timing->lpx; + timing->clk_hs_prepare = 70 * data_rate_mhz / (8 * 1000); + timing->clk_hs_post = timing->clk_hs_prepare + 8; + timing->clk_hs_trail = timing->clk_hs_prepare; + timing->clk_hs_zero = timing->clk_hs_trail * 4; + timing->clk_hs_exit = 2 * timing->clk_hs_trail; timcon0 = timing->lpx | timing->da_hs_prepare << 8 | timing->da_hs_zero << 16 | timing->da_hs_trail << 24; @@ -482,27 +479,39 @@ static void mtk_dsi_config_vdo_timing(struct mtk_dsi *dsi) dsi_tmp_buf_bpp - 10); data_phy_cycles = timing->lpx + timing->da_hs_prepare + - timing->da_hs_zero + timing->da_hs_exit + 2; + timing->da_hs_zero + timing->da_hs_exit + 3; if (dsi->mode_flags & MIPI_DSI_MODE_VIDEO_BURST) { - if (vm->hfront_porch * dsi_tmp_buf_bpp > + if ((vm->hfront_porch + vm->hback_porch) * dsi_tmp_buf_bpp > data_phy_cycles * dsi->lanes + 18) { - horizontal_frontporch_byte = vm->hfront_porch * - dsi_tmp_buf_bpp - - data_phy_cycles * - dsi->lanes - 18; + horizontal_frontporch_byte = + vm->hfront_porch * dsi_tmp_buf_bpp - + (data_phy_cycles * dsi->lanes + 18) * + vm->hfront_porch / + (vm->hfront_porch + vm->hback_porch); + + horizontal_backporch_byte = + horizontal_backporch_byte - + (data_phy_cycles * dsi->lanes + 18) * + vm->hback_porch / + (vm->hfront_porch + vm->hback_porch); } else { DRM_WARN("HFP less than d-phy, FPS will under 60Hz\n"); horizontal_frontporch_byte = vm->hfront_porch * dsi_tmp_buf_bpp; } } else { - if (vm->hfront_porch * dsi_tmp_buf_bpp > + if ((vm->hfront_porch + vm->hback_porch) * dsi_tmp_buf_bpp > data_phy_cycles * dsi->lanes + 12) { - horizontal_frontporch_byte = vm->hfront_porch * - dsi_tmp_buf_bpp - - data_phy_cycles * - dsi->lanes - 12; + horizontal_frontporch_byte = + vm->hfront_porch * dsi_tmp_buf_bpp - + (data_phy_cycles * dsi->lanes + 12) * + vm->hfront_porch / + (vm->hfront_porch + vm->hback_porch); + horizontal_backporch_byte = horizontal_backporch_byte - + (data_phy_cycles * dsi->lanes + 12) * + vm->hback_porch / + (vm->hfront_porch + vm->hback_porch); } else { DRM_WARN("HFP less than d-phy, FPS will under 60Hz\n"); horizontal_frontporch_byte = vm->hfront_porch * From 92adc96f8eecd9522a907c197cc3d62e405539fe Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Wed, 18 Dec 2019 21:26:50 +0800 Subject: [PATCH 017/146] ALSA: usb-audio: set the interface format after resume on Dell WD19 Recently we found the headset-mic on the Dell Dock WD19 doesn't work anymore after s3 (s2i or deep), this problem could be workarounded by closing (pcm_close) the app and then reopening (pcm_open) the app, so this bug is not easy to be detected by users. When problem happens, retire_capture_urb() could still be called periodically, but the size of captured data is always 0, it could be a firmware bug on the dock. Anyway I found after resuming, the snd_usb_pcm_prepare() will be called, and if we forcibly run set_format() to set the interface and its endpoint, the capture size will be normal again. This problem and workaound also apply to playback. To fix it in the kernel, add a quirk to let set_format() run forcibly once after resume. Signed-off-by: Hui Wang Cc: Link: https://lore.kernel.org/r/20191218132650.6303-1-hui.wang@canonical.com Signed-off-by: Takashi Iwai --- sound/usb/card.h | 1 + sound/usb/pcm.c | 21 +++++++++++++++++++-- sound/usb/quirks-table.h | 3 ++- sound/usb/quirks.c | 11 +++++++++++ sound/usb/usbaudio.h | 3 ++- 5 files changed, 35 insertions(+), 4 deletions(-) diff --git a/sound/usb/card.h b/sound/usb/card.h index 2991b9986f66..395403a2d33f 100644 --- a/sound/usb/card.h +++ b/sound/usb/card.h @@ -145,6 +145,7 @@ struct snd_usb_substream { struct snd_usb_endpoint *sync_endpoint; unsigned long flags; bool need_setup_ep; /* (re)configure EP at prepare? */ + bool need_setup_fmt; /* (re)configure fmt after resume? */ unsigned int speed; /* USB_SPEED_XXX */ u64 formats; /* format bitmasks (all or'ed) */ diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c index 9c8930bb00c8..96298c767c76 100644 --- a/sound/usb/pcm.c +++ b/sound/usb/pcm.c @@ -510,11 +510,11 @@ static int set_format(struct snd_usb_substream *subs, struct audioformat *fmt) if (WARN_ON(altsd->bAlternateSetting != fmt->altsetting)) return -EINVAL; - if (fmt == subs->cur_audiofmt) + if (fmt == subs->cur_audiofmt && !subs->need_setup_fmt) return 0; /* close the old interface */ - if (subs->interface >= 0 && subs->interface != fmt->iface) { + if (subs->interface >= 0 && (subs->interface != fmt->iface || subs->need_setup_fmt)) { if (!subs->stream->chip->keep_iface) { err = usb_set_interface(subs->dev, subs->interface, 0); if (err < 0) { @@ -528,6 +528,9 @@ static int set_format(struct snd_usb_substream *subs, struct audioformat *fmt) subs->altset_idx = 0; } + if (subs->need_setup_fmt) + subs->need_setup_fmt = false; + /* set interface */ if (iface->cur_altsetting != alts) { err = snd_usb_select_mode_quirk(subs, fmt); @@ -1728,6 +1731,13 @@ static int snd_usb_substream_playback_trigger(struct snd_pcm_substream *substrea subs->data_endpoint->retire_data_urb = retire_playback_urb; subs->running = 0; return 0; + case SNDRV_PCM_TRIGGER_SUSPEND: + if (subs->stream->chip->setup_fmt_after_resume_quirk) { + stop_endpoints(subs, true); + subs->need_setup_fmt = true; + return 0; + } + break; } return -EINVAL; @@ -1760,6 +1770,13 @@ static int snd_usb_substream_capture_trigger(struct snd_pcm_substream *substream subs->data_endpoint->retire_data_urb = retire_capture_urb; subs->running = 1; return 0; + case SNDRV_PCM_TRIGGER_SUSPEND: + if (subs->stream->chip->setup_fmt_after_resume_quirk) { + stop_endpoints(subs, true); + subs->need_setup_fmt = true; + return 0; + } + break; } return -EINVAL; diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index 70c338f3ae24..d187aa6d50db 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -3466,7 +3466,8 @@ AU0828_DEVICE(0x2040, 0x7270, "Hauppauge", "HVR-950Q"), .vendor_name = "Dell", .product_name = "WD19 Dock", .profile_name = "Dell-WD15-Dock", - .ifnum = QUIRK_NO_INTERFACE + .ifnum = QUIRK_ANY_INTERFACE, + .type = QUIRK_SETUP_FMT_AFTER_RESUME } }, /* MOTU Microbook II */ diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 349e1e52996d..a81c2066499f 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -508,6 +508,16 @@ static int create_standard_mixer_quirk(struct snd_usb_audio *chip, return snd_usb_create_mixer(chip, quirk->ifnum, 0); } + +static int setup_fmt_after_resume_quirk(struct snd_usb_audio *chip, + struct usb_interface *iface, + struct usb_driver *driver, + const struct snd_usb_audio_quirk *quirk) +{ + chip->setup_fmt_after_resume_quirk = 1; + return 1; /* Continue with creating streams and mixer */ +} + /* * audio-interface quirks * @@ -546,6 +556,7 @@ int snd_usb_create_quirk(struct snd_usb_audio *chip, [QUIRK_AUDIO_EDIROL_UAXX] = create_uaxx_quirk, [QUIRK_AUDIO_ALIGN_TRANSFER] = create_align_transfer_quirk, [QUIRK_AUDIO_STANDARD_MIXER] = create_standard_mixer_quirk, + [QUIRK_SETUP_FMT_AFTER_RESUME] = setup_fmt_after_resume_quirk, }; if (quirk->type < QUIRK_TYPE_COUNT) { diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h index ff3cbf653de8..6fe3ab582ec6 100644 --- a/sound/usb/usbaudio.h +++ b/sound/usb/usbaudio.h @@ -33,7 +33,7 @@ struct snd_usb_audio { wait_queue_head_t shutdown_wait; unsigned int txfr_quirk:1; /* Subframe boundaries on transfers */ unsigned int tx_length_quirk:1; /* Put length specifier in transfers */ - + unsigned int setup_fmt_after_resume_quirk:1; /* setup the format to interface after resume */ int num_interfaces; int num_suspended_intf; int sample_rate_read_error; @@ -98,6 +98,7 @@ enum quirk_type { QUIRK_AUDIO_EDIROL_UAXX, QUIRK_AUDIO_ALIGN_TRANSFER, QUIRK_AUDIO_STANDARD_MIXER, + QUIRK_SETUP_FMT_AFTER_RESUME, QUIRK_TYPE_COUNT }; From 57177d214ee0816c4436c23d6c933ccb32c571f1 Mon Sep 17 00:00:00 2001 From: Stefan Mavrodiev Date: Tue, 17 Dec 2019 14:46:32 +0200 Subject: [PATCH 018/146] drm/sun4i: hdmi: Remove duplicate cleanup calls When the HDMI unbinds drm_connector_cleanup() and drm_encoder_cleanup() are called. This also happens when the connector and the encoder are destroyed. This double call triggers a NULL pointer exception. The patch fixes this by removing the cleanup calls in the unbind function. Cc: Fixes: 9c5681011a0c ("drm/sun4i: Add HDMI support") Signed-off-by: Stefan Mavrodiev Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20191217124632.20820-1-stefan@olimex.com --- drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c b/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c index a7c4654445c7..68d4644ac2dc 100644 --- a/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c +++ b/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c @@ -685,8 +685,6 @@ static void sun4i_hdmi_unbind(struct device *dev, struct device *master, struct sun4i_hdmi *hdmi = dev_get_drvdata(dev); cec_unregister_adapter(hdmi->cec_adap); - drm_connector_cleanup(&hdmi->connector); - drm_encoder_cleanup(&hdmi->encoder); i2c_del_adapter(hdmi->i2c); i2c_put_adapter(hdmi->ddc_i2c); clk_disable_unprepare(hdmi->mod_clk); From a4a3893114a41e365274d5fab5d9ff5acc235ff0 Mon Sep 17 00:00:00 2001 From: Jouni Hogander Date: Mon, 9 Dec 2019 14:37:07 +0200 Subject: [PATCH 019/146] MIPS: Prevent link failure with kcov instrumentation __sanitizer_cov_trace_pc() is not linked in and causing link failure if KCOV_INSTRUMENT is enabled. Fix this by disabling instrumentation for compressed image. Signed-off-by: Jouni Hogander Signed-off-by: Paul Burton Cc: Lukas Bulwahn Cc: linux-mips@vger.kernel.org --- arch/mips/boot/compressed/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile index 172801ed35b8..d859f079b771 100644 --- a/arch/mips/boot/compressed/Makefile +++ b/arch/mips/boot/compressed/Makefile @@ -29,6 +29,9 @@ KBUILD_AFLAGS := $(KBUILD_AFLAGS) -D__ASSEMBLY__ \ -DBOOT_HEAP_SIZE=$(BOOT_HEAP_SIZE) \ -DKERNEL_ENTRY=$(VMLINUX_ENTRY_ADDRESS) +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + # decompressor objects (linked with vmlinuz) vmlinuzobjs-y := $(obj)/head.o $(obj)/decompress.o $(obj)/string.o From f8fffebdea752a25757b906f3dffecf1a59a6194 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 5 Dec 2019 10:23:18 -0800 Subject: [PATCH 020/146] MIPS: BPF: Disable MIPS32 eBPF JIT Commit 716850ab104d ("MIPS: eBPF: Initial eBPF support for MIPS32 architecture.") enabled our eBPF JIT for MIPS32 kernels, whereas it has previously only been availailable for MIPS64. It was my understanding at the time that the BPF test suite was passing & JITing a comparable number of tests to our cBPF JIT [1], but it turns out that was not the case. The eBPF JIT has a number of problems on MIPS32: - Most notably various code paths still result in emission of MIPS64 instructions which will cause reserved instruction exceptions & kernel panics when run on MIPS32 CPUs. - The eBPF JIT doesn't account for differences between the O32 ABI used by MIPS32 kernels versus the N64 ABI used by MIPS64 kernels. Notably arguments beyond the first 4 are passed on the stack in O32, and this is entirely unhandled when JITing a BPF_CALL instruction. Stack space must be reserved for arguments even if they all fit in registers, and the callee is free to assume that stack space has been reserved for its use - with the eBPF JIT this is not the case, so calling any function can result in clobbering values on the stack & unpredictable behaviour. Function arguments in eBPF are always 64-bit values which is also entirely unhandled - the JIT still uses a single (32-bit) register per argument. As a result all function arguments are always passed incorrectly when JITing a BPF_CALL instruction, leading to kernel crashes or strange behavior. - The JIT attempts to bail our on use of ALU64 instructions or 64-bit memory access instructions. The code doing this at the start of build_one_insn() incorrectly checks whether BPF_OP() equals BPF_DW, when it should really be checking BPF_SIZE() & only doing so when BPF_CLASS() is one of BPF_{LD,LDX,ST,STX}. This results in false positives that cause more bailouts than intended, and that in turns hides some of the problems described above. - The kernel's cBPF->eBPF translation makes heavy use of 64-bit eBPF instructions that the MIPS32 eBPF JIT bails out on, leading to most cBPF programs not being JITed at all. Until these problems are resolved, revert the enabling of the eBPF JIT on MIPS32 done by commit 716850ab104d ("MIPS: eBPF: Initial eBPF support for MIPS32 architecture."). Note that this does not undo the changes made to the eBPF JIT by that commit, since they are a useful starting point to providing MIPS32 support - they're just not nearly complete. [1] https://lore.kernel.org/linux-mips/MWHPR2201MB13583388481F01A422CE7D66D4410@MWHPR2201MB1358.namprd22.prod.outlook.com/ Signed-off-by: Paul Burton Fixes: 716850ab104d ("MIPS: eBPF: Initial eBPF support for MIPS32 architecture.") Cc: Daniel Borkmann Cc: Hassan Naveed Cc: Tony Ambardar Cc: bpf@vger.kernel.org Cc: netdev@vger.kernel.org Cc: # v5.2+ Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index c86be02b6d89..90ee2ed463a8 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -47,7 +47,7 @@ config MIPS select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES select HAVE_ASM_MODVERSIONS - select HAVE_EBPF_JIT if (!CPU_MICROMIPS) + select HAVE_EBPF_JIT if (64BIT && !CPU_MICROMIPS) select HAVE_CONTEXT_TRACKING select HAVE_COPY_THREAD_TLS select HAVE_C_RECORDMCOUNT From f596cf0d8062cb5d0a4513a8b3afca318c13be10 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 6 Dec 2019 11:07:41 +0300 Subject: [PATCH 021/146] MIPS: BPF: eBPF JIT: check for MIPS ISA compliance in Kconfig It is completely wrong to check for compile-time MIPS ISA revision in the body of bpf_int_jit_compile() as it may lead to get MIPS JIT fully omitted by the CC while the rest system will think that the JIT is actually present and works [1]. We can check if the selected CPU really supports MIPS eBPF JIT at configure time and avoid such situations when kernel can be built without both JIT and interpreter, but with CONFIG_BPF_SYSCALL=y. [1] https://lore.kernel.org/linux-mips/09d713a59665d745e21d021deeaebe0a@dlink.ru/ Fixes: 716850ab104d ("MIPS: eBPF: Initial eBPF support for MIPS32 architecture.") Cc: # v5.2+ Signed-off-by: Alexander Lobakin Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: Hassan Naveed Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: Andrii Nakryiko Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: bpf@vger.kernel.org --- arch/mips/Kconfig | 2 +- arch/mips/net/ebpf_jit.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 90ee2ed463a8..827bbda105f3 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -47,7 +47,7 @@ config MIPS select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES select HAVE_ASM_MODVERSIONS - select HAVE_EBPF_JIT if (64BIT && !CPU_MICROMIPS) + select HAVE_EBPF_JIT if 64BIT && !CPU_MICROMIPS && TARGET_ISA_REV >= 2 select HAVE_CONTEXT_TRACKING select HAVE_COPY_THREAD_TLS select HAVE_C_RECORDMCOUNT diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 46b76751f3a5..a2405d5f7d1e 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -1803,7 +1803,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) unsigned int image_size; u8 *image_ptr; - if (!prog->jit_requested || MIPS_ISA_REV < 2) + if (!prog->jit_requested) return prog; tmp = bpf_jit_blind_constants(prog); From 66c5d718e5a6f80153b5e8d6ad8ba8e9c3320839 Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Mon, 9 Dec 2019 15:56:15 +0800 Subject: [PATCH 022/146] ALSA: hda/realtek - Add headset Mic no shutup for ALC283 Chrome machine had humming noise from external speaker plugin at codec D3 state. Signed-off-by: Kailang Yang Cc: Link: https://lore.kernel.org/r/2692449396954c6c968f5b75e2660358@realtek.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index dbfafee97931..5bc1a6d24333 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -501,6 +501,7 @@ static void alc_shutup_pins(struct hda_codec *codec) struct alc_spec *spec = codec->spec; switch (codec->core.vendor_id) { + case 0x10ec0283: case 0x10ec0286: case 0x10ec0288: case 0x10ec0298: From c9b3b8207bc487de02cbca968927ba2c2cb46aaf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Dec 2019 21:24:28 +0100 Subject: [PATCH 023/146] netfilter: nf_flow_table: fix big-endian integer overflow In some configurations, gcc reports an integer overflow: net/netfilter/nf_flow_table_offload.c: In function 'nf_flow_rule_match': net/netfilter/nf_flow_table_offload.c:80:21: error: unsigned conversion from 'int' to '__be16' {aka 'short unsigned int'} changes value from '327680' to '0' [-Werror=overflow] mask->tcp.flags = TCP_FLAG_RST | TCP_FLAG_FIN; ^~~~~~~~~~~~ From what I can tell, we want the upper 16 bits of these constants, so they need to be shifted in cpu-endian mode. Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Signed-off-by: Arnd Bergmann Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index de7a0d1e15c8..0d72e5ccb47b 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -88,7 +88,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match, switch (tuple->l4proto) { case IPPROTO_TCP: key->tcp.flags = 0; - mask->tcp.flags = TCP_FLAG_RST | TCP_FLAG_FIN; + mask->tcp.flags = cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16); match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP); break; case IPPROTO_UDP: From d05d5db815d56a0ce203ed297153d9794dfdcb68 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 13 Dec 2019 01:19:58 +0100 Subject: [PATCH 024/146] selftests: netfilter: extend flowtable test script with dnat rule NAT test currently covers snat (masquerade) only. Also add a dnat rule and then check that a connecting to the to-be-dnated address will work. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- .../selftests/netfilter/nft_flowtable.sh | 39 ++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 16571ac1dab4..d3e0809ab368 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -226,17 +226,19 @@ check_transfer() return 0 } -test_tcp_forwarding() +test_tcp_forwarding_ip() { local nsa=$1 local nsb=$2 + local dstip=$3 + local dstport=$4 local lret=0 ip netns exec $nsb nc -w 5 -l -p 12345 < "$ns2in" > "$ns2out" & lpid=$! sleep 1 - ip netns exec $nsa nc -w 4 10.0.2.99 12345 < "$ns1in" > "$ns1out" & + ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$ns1in" > "$ns1out" & cpid=$! sleep 3 @@ -258,6 +260,28 @@ test_tcp_forwarding() return $lret } +test_tcp_forwarding() +{ + test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 + + return $? +} + +test_tcp_forwarding_nat() +{ + local lret + + test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 + lret=$? + + if [ $lret -eq 0 ] ; then + test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666 + lret=$? + fi + + return $lret +} + make_file "$ns1in" "ns1" make_file "$ns2in" "ns2" @@ -283,14 +307,19 @@ ip -net ns2 route add 192.168.10.1 via 10.0.2.1 # Same, but with NAT enabled. ip netns exec nsr1 nft -f - < /dev/null ip netns exec ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null -test_tcp_forwarding ns1 ns2 +test_tcp_forwarding_nat ns1 ns2 if [ $? -eq 0 ] ;then echo "PASS: flow offloaded for ns1/ns2 with NAT and pmtu discovery" else From e608f631f0ba5f1fc5ee2e260a3a35d13107cbfe Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 15 Dec 2019 03:49:25 +0100 Subject: [PATCH 025/146] netfilter: ebtables: compat: reject all padding in matches/watchers syzbot reported following splat: BUG: KASAN: vmalloc-out-of-bounds in size_entry_mwt net/bridge/netfilter/ebtables.c:2063 [inline] BUG: KASAN: vmalloc-out-of-bounds in compat_copy_entries+0x128b/0x1380 net/bridge/netfilter/ebtables.c:2155 Read of size 4 at addr ffffc900004461f4 by task syz-executor267/7937 CPU: 1 PID: 7937 Comm: syz-executor267 Not tainted 5.5.0-rc1-syzkaller #0 size_entry_mwt net/bridge/netfilter/ebtables.c:2063 [inline] compat_copy_entries+0x128b/0x1380 net/bridge/netfilter/ebtables.c:2155 compat_do_replace+0x344/0x720 net/bridge/netfilter/ebtables.c:2249 compat_do_ebt_set_ctl+0x22f/0x27e net/bridge/netfilter/ebtables.c:2333 [..] Because padding isn't considered during computation of ->buf_user_offset, "total" is decremented by fewer bytes than it should. Therefore, the first part of if (*total < sizeof(*entry) || entry->next_offset < sizeof(*entry)) will pass, -- it should not have. This causes oob access: entry->next_offset is past the vmalloced size. Reject padding and check that computed user offset (sum of ebt_entry structure plus all individual matches/watchers/targets) is same value that userspace gave us as the offset of the next entry. Reported-by: syzbot+f68108fed972453a0ad4@syzkaller.appspotmail.com Fixes: 81e675c227ec ("netfilter: ebtables: add CONFIG_COMPAT support") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 35 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 4096d8a74a2b..e1256e03a9a8 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1867,7 +1867,7 @@ static int ebt_buf_count(struct ebt_entries_buf_state *state, unsigned int sz) } static int ebt_buf_add(struct ebt_entries_buf_state *state, - void *data, unsigned int sz) + const void *data, unsigned int sz) { if (state->buf_kern_start == NULL) goto count_only; @@ -1901,7 +1901,7 @@ enum compat_mwt { EBT_COMPAT_TARGET, }; -static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, +static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt, enum compat_mwt compat_mwt, struct ebt_entries_buf_state *state, const unsigned char *base) @@ -1979,22 +1979,23 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, /* return size of all matches, watchers or target, including necessary * alignment and padding. */ -static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, +static int ebt_size_mwt(const struct compat_ebt_entry_mwt *match32, unsigned int size_left, enum compat_mwt type, struct ebt_entries_buf_state *state, const void *base) { + const char *buf = (const char *)match32; int growth = 0; - char *buf; if (size_left == 0) return 0; - buf = (char *) match32; - - while (size_left >= sizeof(*match32)) { + do { struct ebt_entry_match *match_kern; int ret; + if (size_left < sizeof(*match32)) + return -EINVAL; + match_kern = (struct ebt_entry_match *) state->buf_kern_start; if (match_kern) { char *tmp; @@ -2031,22 +2032,18 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, if (match_kern) match_kern->match_size = ret; - /* rule should have no remaining data after target */ - if (type == EBT_COMPAT_TARGET && size_left) - return -EINVAL; - match32 = (struct compat_ebt_entry_mwt *) buf; - } + } while (size_left); return growth; } /* called for all ebt_entry structures. */ -static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, +static int size_entry_mwt(const struct ebt_entry *entry, const unsigned char *base, unsigned int *total, struct ebt_entries_buf_state *state) { - unsigned int i, j, startoff, new_offset = 0; + unsigned int i, j, startoff, next_expected_off, new_offset = 0; /* stores match/watchers/targets & offset of next struct ebt_entry: */ unsigned int offsets[4]; unsigned int *offsets_update = NULL; @@ -2132,11 +2129,13 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, return ret; } - startoff = state->buf_user_offset - startoff; - - if (WARN_ON(*total < startoff)) + next_expected_off = state->buf_user_offset - startoff; + if (next_expected_off != entry->next_offset) return -EINVAL; - *total -= startoff; + + if (*total < entry->next_offset) + return -EINVAL; + *total -= entry->next_offset; return 0; } From 8cb4ec44de42b99b92399b4d1daf3dc430ed0186 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 18 Dec 2019 00:59:29 +0100 Subject: [PATCH 026/146] netfilter: nft_tproxy: Fix port selector on Big Endian MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Big Endian architectures, u16 port value was extracted from the wrong parts of u32 sreg_port, just like commit 10596608c4d62 ("netfilter: nf_tables: fix mismatch in big-endian system") describes. Fixes: 4ed8eb6570a49 ("netfilter: nf_tables: Add native tproxy support") Signed-off-by: Phil Sutter Acked-by: Florian Westphal Acked-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tproxy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index f92a82c73880..95980154ef02 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -50,7 +50,7 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr, taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr); if (priv->sreg_port) - tport = regs->data[priv->sreg_port]; + tport = nft_reg_load16(®s->data[priv->sreg_port]); if (!tport) tport = hp->dest; @@ -117,7 +117,7 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr); if (priv->sreg_port) - tport = regs->data[priv->sreg_port]; + tport = nft_reg_load16(®s->data[priv->sreg_port]); if (!tport) tport = hp->dest; From 0141254b0a74b37aa7eb13d42a56adba84d51c73 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 20 Dec 2019 10:31:34 +0100 Subject: [PATCH 027/146] ALSA: usb-audio: fix set_format altsetting sanity check Make sure to check the return value of usb_altnum_to_altsetting() to avoid dereferencing a NULL pointer when the requested alternate settings is missing. The format altsetting number may come from a quirk table and there does not seem to be any other validation of it (the corresponding index is checked however). Fixes: b099b9693d23 ("ALSA: usb-audio: Avoid superfluous usb_set_interface() calls") Cc: stable # 4.18 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191220093134.1248-1-johan@kernel.org Signed-off-by: Takashi Iwai --- sound/usb/pcm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c index 96298c767c76..a11c8150af58 100644 --- a/sound/usb/pcm.c +++ b/sound/usb/pcm.c @@ -506,9 +506,9 @@ static int set_format(struct snd_usb_substream *subs, struct audioformat *fmt) if (WARN_ON(!iface)) return -EINVAL; alts = usb_altnum_to_altsetting(iface, fmt->altsetting); - altsd = get_iface_desc(alts); - if (WARN_ON(altsd->bAlternateSetting != fmt->altsetting)) + if (WARN_ON(!alts)) return -EINVAL; + altsd = get_iface_desc(alts); if (fmt == subs->cur_audiofmt && !subs->need_setup_fmt) return 0; From ac2917b01992c098b8d4e6837115e3ca347fdd90 Mon Sep 17 00:00:00 2001 From: "Ben Dooks (Codethink)" Date: Tue, 17 Dec 2019 11:53:09 +0000 Subject: [PATCH 028/146] drm/arm/mali: make malidp_mw_connector_helper_funcs static The malidp_mw_connector_helper_funcs is not referenced by name outside of the file it is in, so make it static to avoid the following warning: drivers/gpu/drm/arm/malidp_mw.c:59:41: warning: symbol 'malidp_mw_connector_helper_funcs' was not declared. Should it be static? Signed-off-by: Ben Dooks (Codethink) Signed-off-by: Liviu Dudau Link: https://patchwork.freedesktop.org/patch/msgid/20191217115309.2133503-1-ben.dooks@codethink.co.uk --- drivers/gpu/drm/arm/malidp_mw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/arm/malidp_mw.c b/drivers/gpu/drm/arm/malidp_mw.c index 875a3a9eabfa..7d0e7b031e44 100644 --- a/drivers/gpu/drm/arm/malidp_mw.c +++ b/drivers/gpu/drm/arm/malidp_mw.c @@ -56,7 +56,7 @@ malidp_mw_connector_mode_valid(struct drm_connector *connector, return MODE_OK; } -const struct drm_connector_helper_funcs malidp_mw_connector_helper_funcs = { +static const struct drm_connector_helper_funcs malidp_mw_connector_helper_funcs = { .get_modes = malidp_mw_connector_get_modes, .mode_valid = malidp_mw_connector_mode_valid, }; From f33121cbe91973a08e68e4bde8c3f7e6e4e351c1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 18 Dec 2019 16:38:49 +0000 Subject: [PATCH 029/146] rxrpc: Unlock new call in rxrpc_new_incoming_call() rather than the caller Move the unlock and the ping transmission for a new incoming call into rxrpc_new_incoming_call() rather than doing it in the caller. This makes it clearer to see what's going on. Suggested-by: Peter Zijlstra Signed-off-by: David Howells Acked-by: Peter Zijlstra (Intel) cc: Ingo Molnar cc: Will Deacon cc: Davidlohr Bueso --- net/rxrpc/call_accept.c | 36 ++++++++++++++++++++++++++++-------- net/rxrpc/input.c | 18 ------------------ 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 135bf5cd8dd5..3685b1732f65 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -239,6 +239,22 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) kfree(b); } +/* + * Ping the other end to fill our RTT cache and to retrieve the rwind + * and MTU parameters. + */ +static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + ktime_t now = skb->tstamp; + + if (call->peer->rtt_usage < 3 || + ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) + rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial, + true, true, + rxrpc_propose_ack_ping_for_params); +} + /* * Allocate a new incoming call from the prealloc pool, along with a connection * and a peer as necessary. @@ -346,9 +362,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; skb->priority = RX_INVALID_OPERATION; - _leave(" = NULL [close]"); - call = NULL; - goto out; + goto no_call; } /* The peer, connection and call may all have sprung into existence due @@ -361,9 +375,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, call = rxrpc_alloc_incoming_call(rx, local, peer, conn, skb); if (!call) { skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; - _leave(" = NULL [busy]"); - call = NULL; - goto out; + goto no_call; } trace_rxrpc_receive(call, rxrpc_receive_incoming, @@ -432,10 +444,18 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, */ rxrpc_put_call(call, rxrpc_call_put); - _leave(" = %p{%d}", call, call->debug_id); -out: spin_unlock(&rx->incoming_lock); + + rxrpc_send_ping(call, skb); + mutex_unlock(&call->user_mutex); + + _leave(" = %p{%d}", call, call->debug_id); return call; + +no_call: + spin_unlock(&rx->incoming_lock); + _leave(" = NULL [%u]", skb->mark); + return NULL; } /* diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 157be1ff8697..86bd133b4fa0 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -192,22 +192,6 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, goto out_no_clear_ca; } -/* - * Ping the other end to fill our RTT cache and to retrieve the rwind - * and MTU parameters. - */ -static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - ktime_t now = skb->tstamp; - - if (call->peer->rtt_usage < 3 || - ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) - rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial, - true, true, - rxrpc_propose_ack_ping_for_params); -} - /* * Apply a hard ACK by advancing the Tx window. */ @@ -1396,8 +1380,6 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) call = rxrpc_new_incoming_call(local, rx, skb); if (!call) goto reject_packet; - rxrpc_send_ping(call, skb); - mutex_unlock(&call->user_mutex); } /* Process a call packet; this either discards or passes on the ref From 13b7955a0252e15265386b229b814152f109b234 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Dec 2019 16:20:56 +0000 Subject: [PATCH 030/146] rxrpc: Don't take call->user_mutex in rxrpc_new_incoming_call() Standard kernel mutexes cannot be used in any way from interrupt or softirq context, so the user_mutex which manages access to a call cannot be a mutex since on a new call the mutex must start off locked and be unlocked within the softirq handler to prevent userspace interfering with a call we're setting up. Commit a0855d24fc22d49cdc25664fb224caee16998683 ("locking/mutex: Complain upon mutex API misuse in IRQ contexts") causes big warnings to be splashed in dmesg for each a new call that comes in from the server. Whilst it *seems* like it should be okay, since the accept path uses trylock, there are issues with PI boosting and marking the wrong task as the owner. Fix this by not taking the mutex in the softirq path at all. It's not obvious that there should be any need for it as the state is set before the first notification is generated for the new call. There's also no particular reason why the link-assessing ping should be triggered inside the mutex. It's not actually transmitted there anyway, but rather it has to be deferred to a workqueue. Further, I don't think that there's any particular reason that the socket notification needs to be done from within rx->incoming_lock, so the amount of time that lock is held can be shortened too and the ping prepared before the new call notification is sent. Fixes: 540b1c48c37a ("rxrpc: Fix deadlock between call creation and sendmsg/recvmsg") Signed-off-by: David Howells cc: Peter Zijlstra (Intel) cc: Ingo Molnar cc: Will Deacon cc: Davidlohr Bueso --- net/rxrpc/call_accept.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 3685b1732f65..44fa22b020ef 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -381,18 +381,6 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, trace_rxrpc_receive(call, rxrpc_receive_incoming, sp->hdr.serial, sp->hdr.seq); - /* Lock the call to prevent rxrpc_kernel_send/recv_data() and - * sendmsg()/recvmsg() inconveniently stealing the mutex once the - * notification is generated. - * - * The BUG should never happen because the kernel should be well - * behaved enough not to access the call before the first notification - * event and userspace is prevented from doing so until the state is - * appropriate. - */ - if (!mutex_trylock(&call->user_mutex)) - BUG(); - /* Make the call live. */ rxrpc_incoming_call(rx, call, skb); conn = call->conn; @@ -433,6 +421,9 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, BUG(); } spin_unlock(&conn->state_lock); + spin_unlock(&rx->incoming_lock); + + rxrpc_send_ping(call, skb); if (call->state == RXRPC_CALL_SERVER_ACCEPTING) rxrpc_notify_socket(call); @@ -444,11 +435,6 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, */ rxrpc_put_call(call, rxrpc_call_put); - spin_unlock(&rx->incoming_lock); - - rxrpc_send_ping(call, skb); - mutex_unlock(&call->user_mutex); - _leave(" = %p{%d}", call, call->debug_id); return call; From 063c60d39180cec7c9317f5acfc3071f8fecd705 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Dec 2019 16:17:16 +0000 Subject: [PATCH 031/146] rxrpc: Fix missing security check on incoming calls Fix rxrpc_new_incoming_call() to check that we have a suitable service key available for the combination of service ID and security class of a new incoming call - and to reject calls for which we don't. This causes an assertion like the following to appear: rxrpc: Assertion failed - 6(0x6) == 12(0xc) is false kernel BUG at net/rxrpc/call_object.c:456! Where call->state is RXRPC_CALL_SERVER_SECURING (6) rather than RXRPC_CALL_COMPLETE (12). Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Reported-by: Marc Dionne Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 10 ++++-- net/rxrpc/call_accept.c | 14 ++++++-- net/rxrpc/conn_event.c | 16 +-------- net/rxrpc/conn_service.c | 4 +++ net/rxrpc/rxkad.c | 5 +-- net/rxrpc/security.c | 70 +++++++++++++++++++--------------------- 6 files changed, 59 insertions(+), 60 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 7c7d10f2e0c1..5e99df80e80a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -209,6 +209,7 @@ struct rxrpc_skb_priv { struct rxrpc_security { const char *name; /* name of this service */ u8 security_index; /* security type provided */ + u32 no_key_abort; /* Abort code indicating no key */ /* Initialise a security service */ int (*init)(void); @@ -977,8 +978,9 @@ static inline void rxrpc_reduce_conn_timer(struct rxrpc_connection *conn, struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct sk_buff *); struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *, gfp_t); -void rxrpc_new_incoming_connection(struct rxrpc_sock *, - struct rxrpc_connection *, struct sk_buff *); +void rxrpc_new_incoming_connection(struct rxrpc_sock *, struct rxrpc_connection *, + const struct rxrpc_security *, struct key *, + struct sk_buff *); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* @@ -1103,7 +1105,9 @@ extern const struct rxrpc_security rxkad; int __init rxrpc_init_security(void); void rxrpc_exit_security(void); int rxrpc_init_client_conn_security(struct rxrpc_connection *); -int rxrpc_init_server_conn_security(struct rxrpc_connection *); +bool rxrpc_look_up_server_security(struct rxrpc_local *, struct rxrpc_sock *, + const struct rxrpc_security **, struct key **, + struct sk_buff *); /* * sendmsg.c diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 44fa22b020ef..70e44abf106c 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -263,6 +263,8 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_local *local, struct rxrpc_peer *peer, struct rxrpc_connection *conn, + const struct rxrpc_security *sec, + struct key *key, struct sk_buff *skb) { struct rxrpc_backlog *b = rx->backlog; @@ -310,7 +312,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, conn->params.local = rxrpc_get_local(local); conn->params.peer = peer; rxrpc_see_connection(conn); - rxrpc_new_incoming_connection(rx, conn, skb); + rxrpc_new_incoming_connection(rx, conn, sec, key, skb); } else { rxrpc_get_connection(conn); } @@ -349,9 +351,11 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + const struct rxrpc_security *sec = NULL; struct rxrpc_connection *conn; struct rxrpc_peer *peer = NULL; - struct rxrpc_call *call; + struct rxrpc_call *call = NULL; + struct key *key = NULL; _enter(""); @@ -372,7 +376,11 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, */ conn = rxrpc_find_connection_rcu(local, skb, &peer); - call = rxrpc_alloc_incoming_call(rx, local, peer, conn, skb); + if (!conn && !rxrpc_look_up_server_security(local, rx, &sec, &key, skb)) + goto no_call; + + call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, key, skb); + key_put(key); if (!call) { skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; goto no_call; diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index a1ceef4f5cd0..808a4723f868 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -376,21 +376,7 @@ static void rxrpc_secure_connection(struct rxrpc_connection *conn) _enter("{%d}", conn->debug_id); ASSERT(conn->security_ix != 0); - - if (!conn->params.key) { - _debug("set up security"); - ret = rxrpc_init_server_conn_security(conn); - switch (ret) { - case 0: - break; - case -ENOENT: - abort_code = RX_CALL_DEAD; - goto abort; - default: - abort_code = RXKADNOAUTH; - goto abort; - } - } + ASSERT(conn->server_key); if (conn->security->issue_challenge(conn) < 0) { abort_code = RX_CALL_DEAD; diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 123d6ceab15c..21da48e3d2e5 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -148,6 +148,8 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn */ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, struct rxrpc_connection *conn, + const struct rxrpc_security *sec, + struct key *key, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -160,6 +162,8 @@ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, conn->service_id = sp->hdr.serviceId; conn->security_ix = sp->hdr.securityIndex; conn->out_clientflag = 0; + conn->security = sec; + conn->server_key = key_get(key); if (conn->security_ix) conn->state = RXRPC_CONN_SERVICE_UNSECURED; else diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 8d8aa3c230b5..098f1f9ec53b 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -648,9 +648,9 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) u32 serial; int ret; - _enter("{%d,%x}", conn->debug_id, key_serial(conn->params.key)); + _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); - ret = key_validate(conn->params.key); + ret = key_validate(conn->server_key); if (ret < 0) return ret; @@ -1293,6 +1293,7 @@ static void rxkad_exit(void) const struct rxrpc_security rxkad = { .name = "rxkad", .security_index = RXRPC_SECURITY_RXKAD, + .no_key_abort = RXKADUNKNOWNKEY, .init = rxkad_init, .exit = rxkad_exit, .init_connection_security = rxkad_init_connection_security, diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index a4c47d2b7054..9b1fb9ed0717 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -101,62 +101,58 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) } /* - * initialise the security on a server connection + * Find the security key for a server connection. */ -int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) +bool rxrpc_look_up_server_security(struct rxrpc_local *local, struct rxrpc_sock *rx, + const struct rxrpc_security **_sec, + struct key **_key, + struct sk_buff *skb) { const struct rxrpc_security *sec; - struct rxrpc_local *local = conn->params.local; - struct rxrpc_sock *rx; - struct key *key; - key_ref_t kref; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + key_ref_t kref = NULL; char kdesc[5 + 1 + 3 + 1]; _enter(""); - sprintf(kdesc, "%u:%u", conn->service_id, conn->security_ix); + sprintf(kdesc, "%u:%u", sp->hdr.serviceId, sp->hdr.securityIndex); - sec = rxrpc_security_lookup(conn->security_ix); + sec = rxrpc_security_lookup(sp->hdr.securityIndex); if (!sec) { - _leave(" = -ENOKEY [lookup]"); - return -ENOKEY; + trace_rxrpc_abort(0, "SVS", + sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EKEYREJECTED); + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; + skb->priority = RX_INVALID_OPERATION; + return false; } - /* find the service */ - read_lock(&local->services_lock); - rx = rcu_dereference_protected(local->service, - lockdep_is_held(&local->services_lock)); - if (rx && (rx->srx.srx_service == conn->service_id || - rx->second_service == conn->service_id)) - goto found_service; + if (sp->hdr.securityIndex == RXRPC_SECURITY_NONE) + goto out; - /* the service appears to have died */ - read_unlock(&local->services_lock); - _leave(" = -ENOENT"); - return -ENOENT; - -found_service: if (!rx->securities) { - read_unlock(&local->services_lock); - _leave(" = -ENOKEY"); - return -ENOKEY; + trace_rxrpc_abort(0, "SVR", + sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EKEYREJECTED); + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; + skb->priority = RX_INVALID_OPERATION; + return false; } /* look through the service's keyring */ kref = keyring_search(make_key_ref(rx->securities, 1UL), &key_type_rxrpc_s, kdesc, true); if (IS_ERR(kref)) { - read_unlock(&local->services_lock); - _leave(" = %ld [search]", PTR_ERR(kref)); - return PTR_ERR(kref); + trace_rxrpc_abort(0, "SVK", + sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + sec->no_key_abort, EKEYREJECTED); + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; + skb->priority = sec->no_key_abort; + return false; } - key = key_ref_to_ptr(kref); - read_unlock(&local->services_lock); - - conn->server_key = key; - conn->security = sec; - - _leave(" = 0"); - return 0; +out: + *_sec = sec; + *_key = key_ref_to_ptr(kref); + return true; } From 8cc0991c09bfd11fd878b0321a7a06724520d879 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 20 Dec 2019 19:17:02 -0500 Subject: [PATCH 032/146] ALSA: hda/hdmi - fix atpx_present when CLASS is not VGA You can't use PCI_BASE_CLASS with pci_get_class(). This happens to work by luck on devices with PCI_CLASS_DISPLAY_VGA, but misses PCI_CLASS_DISPLAY_OTHER. Add a check for those as well. Fixes: 586bc4aab878 ("ALSA: hda/hdmi - fix vgaswitcheroo detection for AMD") Signed-off-by: Alex Deucher Link: https://lore.kernel.org/r/20191221001702.1338587-1-alexander.deucher@amd.com Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index b856b89378ac..f69c8de64bd6 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -1410,7 +1410,17 @@ static bool atpx_present(void) acpi_handle dhandle, atpx_handle; acpi_status status; - while ((pdev = pci_get_class(PCI_BASE_CLASS_DISPLAY << 16, pdev)) != NULL) { + while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, pdev)) != NULL) { + dhandle = ACPI_HANDLE(&pdev->dev); + if (dhandle) { + status = acpi_get_handle(dhandle, "ATPX", &atpx_handle); + if (!ACPI_FAILURE(status)) { + pci_dev_put(pdev); + return true; + } + } + } + while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_OTHER << 8, pdev)) != NULL) { dhandle = ACPI_HANDLE(&pdev->dev); if (dhandle) { status = acpi_get_handle(dhandle, "ATPX", &atpx_handle); From 43cf75d96409a20ef06b756877a2e72b10a026fc Mon Sep 17 00:00:00 2001 From: chenqiwu Date: Thu, 19 Dec 2019 14:29:53 +0800 Subject: [PATCH 033/146] exit: panic before exit_mm() on global init exit Currently, when global init and all threads in its thread-group have exited we panic via: do_exit() -> exit_notify() -> forget_original_parent() -> find_child_reaper() This makes it hard to extract a useable coredump for global init from a kernel crashdump because by the time we panic exit_mm() will have already released global init's mm. This patch moves the panic futher up before exit_mm() is called. As was the case previously, we only panic when global init and all its threads in the thread-group have exited. Signed-off-by: chenqiwu Acked-by: Christian Brauner Acked-by: Oleg Nesterov [christian.brauner@ubuntu.com: fix typo, rewrite commit message] Link: https://lore.kernel.org/r/1576736993-10121-1-git-send-email-qiwuchen55@gmail.com Signed-off-by: Christian Brauner --- kernel/exit.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index a46a50d67002..fc364272759d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -517,10 +517,6 @@ static struct task_struct *find_child_reaper(struct task_struct *father, } write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) { - panic("Attempted to kill init! exitcode=0x%08x\n", - father->signal->group_exit_code ?: father->exit_code); - } list_for_each_entry_safe(p, n, dead, ptrace_entry) { list_del_init(&p->ptrace_entry); @@ -786,6 +782,14 @@ void __noreturn do_exit(long code) acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { + /* + * If the last thread of global init has exited, panic + * immediately to get a useable coredump. + */ + if (unlikely(is_global_init(tsk))) + panic("Attempted to kill init! exitcode=0x%08x\n", + tsk->signal->group_exit_code ?: (int)code); + #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); From f54c7898ed1c3c9331376c0337a5049c38f66497 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 22 Dec 2019 23:37:40 +0100 Subject: [PATCH 034/146] bpf: Fix precision tracking for unbounded scalars Anatoly has been fuzzing with kBdysch harness and reported a hang in one of the outcomes. Upon closer analysis, it turns out that precise scalar value tracking is missing a few precision markings for unknown scalars: 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (b7) r0 = 0 1: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 1: (35) if r0 >= 0xf72e goto pc+0 --> only follow fallthrough 2: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 2: (35) if r0 >= 0x80fe0000 goto pc+0 --> only follow fallthrough 3: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 3: (14) w0 -= -536870912 4: R0_w=invP536870912 R1=ctx(id=0,off=0,imm=0) R10=fp0 4: (0f) r1 += r0 5: R0_w=invP536870912 R1_w=inv(id=0) R10=fp0 5: (55) if r1 != 0x104c1500 goto pc+0 --> push other branch for later analysis R0_w=invP536870912 R1_w=inv273421568 R10=fp0 6: R0_w=invP536870912 R1_w=inv273421568 R10=fp0 6: (b7) r0 = 0 7: R0=invP0 R1=inv273421568 R10=fp0 7: (76) if w1 s>= 0xffffff00 goto pc+3 --> only follow goto 11: R0=invP0 R1=inv273421568 R10=fp0 11: (95) exit 6: R0_w=invP536870912 R1_w=inv(id=0) R10=fp0 6: (b7) r0 = 0 propagating r0 7: safe processed 11 insns [...] In the analysis of the second path coming after the successful exit above, the path is being pruned at line 7. Pruning analysis found that both r0 are precise P0 and both R1 are non-precise scalars and given prior path with R1 as non-precise scalar succeeded, this one is therefore safe as well. However, problem is that given condition at insn 7 in the first run, we only followed goto and didn't push the other branch for later analysis, we've never walked the few insns in there and therefore dead-code sanitation rewrites it as goto pc-1, causing the hang depending on the skb address hitting these conditions. The issue is that R1 should have been marked as precise as well such that pruning enforces range check and conluded that new R1 is not in range of old R1. In insn 4, we mark R1 (skb) as unknown scalar via __mark_reg_unbounded() but not mark_reg_unbounded() and therefore regs->precise remains as false. Back in b5dc0163d8fd ("bpf: precise scalar_value tracking"), this was not the case since marking out of __mark_reg_unbounded() had this covered as well. Once in both are set as precise in 4 as they should have been, we conclude that given R1 was in prior fall-through path 0x104c1500 and now is completely unknown, the check at insn 7 concludes that we need to continue walking. Analysis after the fix: 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (b7) r0 = 0 1: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 1: (35) if r0 >= 0xf72e goto pc+0 2: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 2: (35) if r0 >= 0x80fe0000 goto pc+0 3: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 3: (14) w0 -= -536870912 4: R0_w=invP536870912 R1=ctx(id=0,off=0,imm=0) R10=fp0 4: (0f) r1 += r0 5: R0_w=invP536870912 R1_w=invP(id=0) R10=fp0 5: (55) if r1 != 0x104c1500 goto pc+0 R0_w=invP536870912 R1_w=invP273421568 R10=fp0 6: R0_w=invP536870912 R1_w=invP273421568 R10=fp0 6: (b7) r0 = 0 7: R0=invP0 R1=invP273421568 R10=fp0 7: (76) if w1 s>= 0xffffff00 goto pc+3 11: R0=invP0 R1=invP273421568 R10=fp0 11: (95) exit 6: R0_w=invP536870912 R1_w=invP(id=0) R10=fp0 6: (b7) r0 = 0 7: R0_w=invP0 R1_w=invP(id=0) R10=fp0 7: (76) if w1 s>= 0xffffff00 goto pc+3 R0_w=invP0 R1_w=invP(id=0) R10=fp0 8: R0_w=invP0 R1_w=invP(id=0) R10=fp0 8: (a5) if r0 < 0x2007002a goto pc+0 9: R0_w=invP0 R1_w=invP(id=0) R10=fp0 9: (57) r0 &= -16316416 10: R0_w=invP0 R1_w=invP(id=0) R10=fp0 10: (a6) if w0 < 0x1201 goto pc+0 11: R0_w=invP0 R1_w=invP(id=0) R10=fp0 11: (95) exit 11: R0=invP0 R1=invP(id=0) R10=fp0 11: (95) exit processed 16 insns [...] Fixes: 6754172c208d ("bpf: fix precision tracking in presence of bpf2bpf calls") Reported-by: Anatoly Trosinenko Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191222223740.25297-1-daniel@iogearbox.net --- kernel/bpf/verifier.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4983940cbdca..6f63ae7a370c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -907,7 +907,8 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void __mark_reg_not_init(struct bpf_reg_state *reg); +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); /* Mark the unknown part of a register (variable offset or scalar value) as * known to have the value @imm. @@ -945,7 +946,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, verbose(env, "mark_reg_known_zero(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); return; } __mark_reg_known_zero(regs + regno); @@ -1054,7 +1055,8 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) } /* Mark a register as having a completely unknown (scalar) value. */ -static void __mark_reg_unknown(struct bpf_reg_state *reg) +static void __mark_reg_unknown(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg) { /* * Clear type, id, off, and union(map_ptr, range) and @@ -1064,6 +1066,8 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; + reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? + true : false; __mark_reg_unbounded(reg); } @@ -1074,19 +1078,16 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, verbose(env, "mark_reg_unknown(regs, %u)\n", regno); /* Something bad happened, let's kill all regs except FP */ for (regno = 0; regno < BPF_REG_FP; regno++) - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); return; } - regs += regno; - __mark_reg_unknown(regs); - /* constant backtracking is enabled for root without bpf2bpf calls */ - regs->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? - true : false; + __mark_reg_unknown(env, regs + regno); } -static void __mark_reg_not_init(struct bpf_reg_state *reg) +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg) { - __mark_reg_unknown(reg); + __mark_reg_unknown(env, reg); reg->type = NOT_INIT; } @@ -1097,10 +1098,10 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, verbose(env, "mark_reg_not_init(regs, %u)\n", regno); /* Something bad happened, let's kill all regs except FP */ for (regno = 0; regno < BPF_REG_FP; regno++) - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); return; } - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); } #define DEF_NOT_SUBREG (0) @@ -3234,7 +3235,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } if (state->stack[spi].slot_type[0] == STACK_SPILL && state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { - __mark_reg_unknown(&state->stack[spi].spilled_ptr); + __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) state->stack[spi].slot_type[j] = STACK_MISC; goto mark; @@ -3892,7 +3893,7 @@ static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, if (!reg) continue; if (reg_is_pkt_pointer_any(reg)) - __mark_reg_unknown(reg); + __mark_reg_unknown(env, reg); } } @@ -3920,7 +3921,7 @@ static void release_reg_references(struct bpf_verifier_env *env, if (!reg) continue; if (reg->ref_obj_id == ref_obj_id) - __mark_reg_unknown(reg); + __mark_reg_unknown(env, reg); } } @@ -4582,7 +4583,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* Taint dst register if offset had invalid bounds derived from * e.g. dead branches. */ - __mark_reg_unknown(dst_reg); + __mark_reg_unknown(env, dst_reg); return 0; } @@ -4834,13 +4835,13 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Taint dst register if offset had invalid bounds derived from * e.g. dead branches. */ - __mark_reg_unknown(dst_reg); + __mark_reg_unknown(env, dst_reg); return 0; } if (!src_known && opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { - __mark_reg_unknown(dst_reg); + __mark_reg_unknown(env, dst_reg); return 0; } @@ -6982,7 +6983,7 @@ static void clean_func_state(struct bpf_verifier_env *env, /* since the register is unused, clear its state * to make further comparison simpler */ - __mark_reg_not_init(&st->regs[i]); + __mark_reg_not_init(env, &st->regs[i]); } for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { @@ -6990,7 +6991,7 @@ static void clean_func_state(struct bpf_verifier_env *env, /* liveness must not touch this stack slot anymore */ st->stack[i].spilled_ptr.live |= REG_LIVE_DONE; if (!(live & REG_LIVE_READ)) { - __mark_reg_not_init(&st->stack[i].spilled_ptr); + __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) st->stack[i].slot_type[j] = STACK_INVALID; } From 2f42e05b942fe2fbfb9bbc6e34e1dd8c3ce4f3a4 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 18 Dec 2019 19:09:06 +0000 Subject: [PATCH 035/146] dmaengine: k3dma: Avoid null pointer traversal In some cases we seem to submit two transactions in a row, which causes us to lose track of the first. If we then cancel the request, we may still get an interrupt, which traverses a null ds_run value. So try to avoid starting a new transaction if the ds_run value is set. While this patch avoids the null pointer crash, I've had some reports of the k3dma driver still getting confused, which suggests the ds_run/ds_done value handling still isn't quite right. However, I've not run into an issue recently with it so I think this patch is worth pushing upstream to avoid the crash. Signed-off-by: John Stultz [add ss tag] Link: https://lore.kernel.org/r/20191218190906.6641-1-john.stultz@linaro.org Signed-off-by: Vinod Koul --- drivers/dma/k3dma.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/dma/k3dma.c b/drivers/dma/k3dma.c index adecea51814f..c5c1aa0dcaed 100644 --- a/drivers/dma/k3dma.c +++ b/drivers/dma/k3dma.c @@ -229,9 +229,11 @@ static irqreturn_t k3_dma_int_handler(int irq, void *dev_id) c = p->vchan; if (c && (tc1 & BIT(i))) { spin_lock_irqsave(&c->vc.lock, flags); - vchan_cookie_complete(&p->ds_run->vd); - p->ds_done = p->ds_run; - p->ds_run = NULL; + if (p->ds_run != NULL) { + vchan_cookie_complete(&p->ds_run->vd); + p->ds_done = p->ds_run; + p->ds_run = NULL; + } spin_unlock_irqrestore(&c->vc.lock, flags); } if (c && (tc2 & BIT(i))) { @@ -271,6 +273,10 @@ static int k3_dma_start_txd(struct k3_dma_chan *c) if (BIT(c->phy->idx) & k3_dma_get_chan_stat(d)) return -EAGAIN; + /* Avoid losing track of ds_run if a transaction is in flight */ + if (c->phy->ds_run) + return -EAGAIN; + if (vd) { struct k3_dma_desc_sw *ds = container_of(vd, struct k3_dma_desc_sw, vd); From 24461d9792c2c706092805ff1b067628933441bd Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 20 Dec 2019 15:11:00 +0200 Subject: [PATCH 036/146] dmaengine: virt-dma: Fix access after free in vchan_complete() vchan_vdesc_fini() is freeing up 'vd' so the access to vd->tx_result is via already freed up memory. Move the vchan_vdesc_fini() after invoking the callback to avoid this. Fixes: 09d5b702b0f97 ("dmaengine: virt-dma: store result on dma descriptor") Signed-off-by: Peter Ujfalusi Reviewed-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20191220131100.21804-1-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- drivers/dma/virt-dma.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/dma/virt-dma.c b/drivers/dma/virt-dma.c index ec4adf4260a0..256fc662c500 100644 --- a/drivers/dma/virt-dma.c +++ b/drivers/dma/virt-dma.c @@ -104,9 +104,8 @@ static void vchan_complete(unsigned long arg) dmaengine_desc_get_callback(&vd->tx, &cb); list_del(&vd->node); - vchan_vdesc_fini(vd); - dmaengine_desc_callback_invoke(&cb, &vd->tx_result); + vchan_vdesc_fini(vd); } } From 91a063c956084fb21cf2523bce6892514e3f1799 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sat, 21 Dec 2019 14:16:54 +1100 Subject: [PATCH 037/146] powerpc/mm: Mark get_slice_psize() & slice_addr_is_low() as notrace These slice routines are called from the SLB miss handler, which can lead to warnings from the IRQ code, because we have not reconciled the IRQ state properly: WARNING: CPU: 72 PID: 30150 at arch/powerpc/kernel/irq.c:258 arch_local_irq_restore.part.0+0xcc/0x100 Modules linked in: CPU: 72 PID: 30150 Comm: ftracetest Not tainted 5.5.0-rc2-gcc9x-g7e0165b2f1a9 #1 NIP: c00000000001d83c LR: c00000000029ab90 CTR: c00000000026cf90 REGS: c0000007eee3b960 TRAP: 0700 Not tainted (5.5.0-rc2-gcc9x-g7e0165b2f1a9) MSR: 8000000000021033 CR: 22242844 XER: 20000000 CFAR: c00000000001d780 IRQMASK: 0 ... NIP arch_local_irq_restore.part.0+0xcc/0x100 LR trace_graph_entry+0x270/0x340 Call Trace: trace_graph_entry+0x254/0x340 (unreliable) function_graph_enter+0xe4/0x1a0 prepare_ftrace_return+0xa0/0x130 ftrace_graph_caller+0x44/0x94 # (get_slice_psize()) slb_allocate_user+0x7c/0x100 do_slb_fault+0xf8/0x300 instruction_access_slb_common+0x140/0x180 Fixes: 48e7b7695745 ("powerpc/64s/hash: Convert SLB miss handlers to C") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191221121337.4894-1-mpe@ellerman.id.au --- arch/powerpc/mm/slice.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 42bbcd47cc85..dffe1a45b6ed 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -50,7 +50,7 @@ static void slice_print_mask(const char *label, const struct slice_mask *mask) { #endif -static inline bool slice_addr_is_low(unsigned long addr) +static inline notrace bool slice_addr_is_low(unsigned long addr) { u64 tmp = (u64)addr; @@ -659,7 +659,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, mm_ctx_user_psize(¤t->mm->context), 1); } -unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) +unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr) { unsigned char *psizes; int index, mask_index; From fa633a0f89192379828103957874682d389eae83 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 23 Dec 2019 15:13:26 +0900 Subject: [PATCH 038/146] libbpf: Fix build on read-only filesystems I got the following error when I tried to build perf on a read-only filesystem with O=dir option. $ cd /some/where/ro/linux/tools/perf $ make O=$HOME/build/perf ... CC /home/namhyung/build/perf/lib.o /bin/sh: bpf_helper_defs.h: Read-only file system make[3]: *** [Makefile:184: bpf_helper_defs.h] Error 1 make[2]: *** [Makefile.perf:778: /home/namhyung/build/perf/libbpf.a] Error 2 make[2]: *** Waiting for unfinished jobs.... LD /home/namhyung/build/perf/libperf-in.o AR /home/namhyung/build/perf/libperf.a PERF_VERSION = 5.4.0 make[1]: *** [Makefile.perf:225: sub-make] Error 2 make: *** [Makefile:70: all] Error 2 It was becaused bpf_helper_defs.h was generated in current directory. Move it to OUTPUT directory. Signed-off-by: Namhyung Kim Signed-off-by: Daniel Borkmann Tested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191223061326.843366-1-namhyung@kernel.org --- tools/lib/bpf/Makefile | 15 ++++++++------- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 6 +++--- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index defae23a0169..97830e46d1a0 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -138,6 +138,7 @@ STATIC_OBJDIR := $(OUTPUT)staticobjs/ BPF_IN_SHARED := $(SHARED_OBJDIR)libbpf-in.o BPF_IN_STATIC := $(STATIC_OBJDIR)libbpf-in.o VERSION_SCRIPT := libbpf.map +BPF_HELPER_DEFS := $(OUTPUT)bpf_helper_defs.h LIB_TARGET := $(addprefix $(OUTPUT),$(LIB_TARGET)) LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE)) @@ -159,7 +160,7 @@ all: fixdep all_cmd: $(CMD_TARGETS) check -$(BPF_IN_SHARED): force elfdep bpfdep bpf_helper_defs.h +$(BPF_IN_SHARED): force elfdep bpfdep $(BPF_HELPER_DEFS) @(test -f ../../include/uapi/linux/bpf.h -a -f ../../../include/uapi/linux/bpf.h && ( \ (diff -B ../../include/uapi/linux/bpf.h ../../../include/uapi/linux/bpf.h >/dev/null) || \ echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf.h' differs from latest version at 'include/uapi/linux/bpf.h'" >&2 )) || true @@ -177,12 +178,12 @@ $(BPF_IN_SHARED): force elfdep bpfdep bpf_helper_defs.h echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_xdp.h' differs from latest version at 'include/uapi/linux/if_xdp.h'" >&2 )) || true $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(SHARED_OBJDIR) CFLAGS="$(CFLAGS) $(SHLIB_FLAGS)" -$(BPF_IN_STATIC): force elfdep bpfdep bpf_helper_defs.h +$(BPF_IN_STATIC): force elfdep bpfdep $(BPF_HELPER_DEFS) $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR) -bpf_helper_defs.h: $(srctree)/tools/include/uapi/linux/bpf.h +$(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h $(Q)$(srctree)/scripts/bpf_helpers_doc.py --header \ - --file $(srctree)/tools/include/uapi/linux/bpf.h > bpf_helper_defs.h + --file $(srctree)/tools/include/uapi/linux/bpf.h > $(BPF_HELPER_DEFS) $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) @@ -243,7 +244,7 @@ install_lib: all_cmd $(call do_install_mkdir,$(libdir_SQ)); \ cp -fpR $(LIB_FILE) $(DESTDIR)$(libdir_SQ) -install_headers: bpf_helper_defs.h +install_headers: $(BPF_HELPER_DEFS) $(call QUIET_INSTALL, headers) \ $(call do_install,bpf.h,$(prefix)/include/bpf,644); \ $(call do_install,libbpf.h,$(prefix)/include/bpf,644); \ @@ -251,7 +252,7 @@ install_headers: bpf_helper_defs.h $(call do_install,libbpf_util.h,$(prefix)/include/bpf,644); \ $(call do_install,xsk.h,$(prefix)/include/bpf,644); \ $(call do_install,bpf_helpers.h,$(prefix)/include/bpf,644); \ - $(call do_install,bpf_helper_defs.h,$(prefix)/include/bpf,644); \ + $(call do_install,$(BPF_HELPER_DEFS),$(prefix)/include/bpf,644); \ $(call do_install,bpf_tracing.h,$(prefix)/include/bpf,644); \ $(call do_install,bpf_endian.h,$(prefix)/include/bpf,644); \ $(call do_install,bpf_core_read.h,$(prefix)/include/bpf,644); @@ -271,7 +272,7 @@ config-clean: clean: $(call QUIET_CLEAN, libbpf) $(RM) -rf $(CMD_TARGETS) \ *.o *~ *.a *.so *.so.$(LIBBPF_MAJOR_VERSION) .*.d .*.cmd \ - *.pc LIBBPF-CFLAGS bpf_helper_defs.h \ + *.pc LIBBPF-CFLAGS $(BPF_HELPER_DEFS) \ $(SHARED_OBJDIR) $(STATIC_OBJDIR) $(call QUIET_CLEAN, core-gen) $(RM) $(OUTPUT)FEATURE-DUMP.libbpf diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 419652458da4..1ff0a9f49c01 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -40,3 +40,4 @@ xdping test_cpp /no_alu32 /bpf_gcc +bpf_helper_defs.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e0fe01d9ec33..e2fd6f8d579c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -120,9 +120,9 @@ force: $(BPFOBJ): force $(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/ -BPF_HELPERS := $(BPFDIR)/bpf_helper_defs.h $(wildcard $(BPFDIR)/bpf_*.h) -$(BPFDIR)/bpf_helper_defs.h: - $(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/ bpf_helper_defs.h +BPF_HELPERS := $(OUTPUT)/bpf_helper_defs.h $(wildcard $(BPFDIR)/bpf_*.h) +$(OUTPUT)/bpf_helper_defs.h: + $(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/ $(OUTPUT)/bpf_helper_defs.h # Get Clang's default includes on this system, as opposed to those seen by # '-target bpf'. This fixes "missing" files on some architectures/distros, From c366b3dbbab14b28d044b94eb9ce77c23482ea35 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 23 Dec 2019 23:18:16 +0100 Subject: [PATCH 039/146] ALSA: hda - Apply sync-write workaround to old Intel platforms, too Klaus Ethgen reported occasional high CPU usages in his system that seem caused by HD-audio driver. The perf output revealed that it's in the unsolicited event handling in the workqueue, and the problem seems triggered by some communication stall between the controller and the codec at the runtime or system resume. Actually a similar phenomenon was seen in the past for other Intel platforms, and we already applied the workaround to enforce sync-write for CORB/RIRB verbs for Skylake and newer chipsets (commit 2756d9143aa5 "ALSA: hda - Fix intermittent CORB/RIRB stall on Intel chips"). Fortunately, the same workaround is applicable to the old chipset, and the experiment showed the positive effect. Based on the experiment result, this patch enables the sync-write workaround for all Intel chipsets. The only reason I hesitated to apply this workaround was about the possibly slightly higher CPU usage. But if the lack of sync causes a much severer problem even for quite old chip, we should think this would be necessary for all Intel chips. Reported-by: Klaus Ethgen Cc: Link: https://lore.kernel.org/r/20191223171833.GA17053@chua Link: https://lore.kernel.org/r/20191223221816.32572-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index f69c8de64bd6..5b92f290cbb0 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -282,12 +282,13 @@ enum { /* quirks for old Intel chipsets */ #define AZX_DCAPS_INTEL_ICH \ - (AZX_DCAPS_OLD_SSYNC | AZX_DCAPS_NO_ALIGN_BUFSIZE) + (AZX_DCAPS_OLD_SSYNC | AZX_DCAPS_NO_ALIGN_BUFSIZE |\ + AZX_DCAPS_SYNC_WRITE) /* quirks for Intel PCH */ #define AZX_DCAPS_INTEL_PCH_BASE \ (AZX_DCAPS_NO_ALIGN_BUFSIZE | AZX_DCAPS_COUNT_LPIB_DELAY |\ - AZX_DCAPS_SNOOP_TYPE(SCH)) + AZX_DCAPS_SNOOP_TYPE(SCH) | AZX_DCAPS_SYNC_WRITE) /* PCH up to IVB; no runtime PM; bind with i915 gfx */ #define AZX_DCAPS_INTEL_PCH_NOPM \ @@ -302,13 +303,13 @@ enum { #define AZX_DCAPS_INTEL_HASWELL \ (/*AZX_DCAPS_ALIGN_BUFSIZE |*/ AZX_DCAPS_COUNT_LPIB_DELAY |\ AZX_DCAPS_PM_RUNTIME | AZX_DCAPS_I915_COMPONENT |\ - AZX_DCAPS_SNOOP_TYPE(SCH)) + AZX_DCAPS_SNOOP_TYPE(SCH) | AZX_DCAPS_SYNC_WRITE) /* Broadwell HDMI can't use position buffer reliably, force to use LPIB */ #define AZX_DCAPS_INTEL_BROADWELL \ (/*AZX_DCAPS_ALIGN_BUFSIZE |*/ AZX_DCAPS_POSFIX_LPIB |\ AZX_DCAPS_PM_RUNTIME | AZX_DCAPS_I915_COMPONENT |\ - AZX_DCAPS_SNOOP_TYPE(SCH)) + AZX_DCAPS_SNOOP_TYPE(SCH) | AZX_DCAPS_SYNC_WRITE) #define AZX_DCAPS_INTEL_BAYTRAIL \ (AZX_DCAPS_INTEL_PCH_BASE | AZX_DCAPS_I915_COMPONENT) From feed8a4fc9d46c3126fb9fcae0e9248270c6321a Mon Sep 17 00:00:00 2001 From: Antonio Messina Date: Thu, 19 Dec 2019 15:08:03 +0100 Subject: [PATCH 040/146] udp: fix integer overflow while computing available space in sk_rcvbuf When the size of the receive buffer for a socket is close to 2^31 when computing if we have enough space in the buffer to copy a packet from the queue to the buffer we might hit an integer overflow. When an user set net.core.rmem_default to a value close to 2^31 UDP packets are dropped because of this overflow. This can be visible, for instance, with failure to resolve hostnames. This can be fixed by casting sk_rcvbuf (which is an int) to unsigned int, similarly to how it is done in TCP. Signed-off-by: Antonio Messina Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4da5758cc718..93a355b6b092 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1475,7 +1475,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) * queue contains some other skb */ rmem = atomic_add_return(size, &sk->sk_rmem_alloc); - if (rmem > (size + sk->sk_rcvbuf)) + if (rmem > (size + (unsigned int)sk->sk_rcvbuf)) goto uncharge_drop; spin_lock(&list->lock); From 61d5d4062876e21331c3d0ba4b02dbd50c06a658 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 20 Dec 2019 15:03:44 -0300 Subject: [PATCH 041/146] sctp: fix err handling of stream initialization The fix on 951c6db954a1 fixed the issued reported there but introduced another. When the allocation fails within sctp_stream_init() it is okay/necessary to free the genradix. But it is also called when adding new streams, from sctp_send_add_streams() and sctp_process_strreset_addstrm_in() and in those situations it cannot just free the genradix because by then it is a fully operational association. The fix here then is to only free the genradix in sctp_stream_init() and on those other call sites move on with what it already had and let the subsequent error handling to handle it. Tested with the reproducers from this report and the previous one, with lksctp-tools and sctp-tests. Reported-by: syzbot+9a1bc632e78a1a98488b@syzkaller.appspotmail.com Fixes: 951c6db954a1 ("sctp: fix memleak on err handling of stream initialization") Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 6a30392068a0..c1a100d2fed3 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -84,10 +84,8 @@ static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, return 0; ret = genradix_prealloc(&stream->out, outcnt, gfp); - if (ret) { - genradix_free(&stream->out); + if (ret) return ret; - } stream->outcnt = outcnt; return 0; @@ -102,10 +100,8 @@ static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, return 0; ret = genradix_prealloc(&stream->in, incnt, gfp); - if (ret) { - genradix_free(&stream->in); + if (ret) return ret; - } stream->incnt = incnt; return 0; @@ -123,7 +119,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, * a new one with new outcnt to save memory if needed. */ if (outcnt == stream->outcnt) - goto in; + goto handle_in; /* Filter out chunks queued on streams that won't exist anymore */ sched->unsched_all(stream); @@ -132,24 +128,28 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, ret = sctp_stream_alloc_out(stream, outcnt, gfp); if (ret) - goto out; + goto out_err; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; -in: +handle_in: sctp_stream_interleave_init(stream); if (!incnt) goto out; ret = sctp_stream_alloc_in(stream, incnt, gfp); - if (ret) { - sched->free(stream); - genradix_free(&stream->out); - stream->outcnt = 0; - goto out; - } + if (ret) + goto in_err; + goto out; + +in_err: + sched->free(stream); + genradix_free(&stream->in); +out_err: + genradix_free(&stream->out); + stream->outcnt = 0; out: return ret; } From 7c3125f0a6ebc17846c5908ad7d6056d66c1c426 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 20 Dec 2019 11:24:21 -0800 Subject: [PATCH 042/146] net: dsa: bcm_sf2: Fix IP fragment location and behavior The IP fragment is specified through user-defined field as the first bit of the first user-defined word. We were previously trying to extract it from the user-defined mask which could not possibly work. The ip_frag is also supposed to be a boolean, if we do not cast it as such, we risk overwriting the next fields in CFP_DATA(6) which would render the rule inoperative. Fixes: 7318166cacad ("net: dsa: bcm_sf2: Add support for ethtool::rxnfc") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2_cfp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c index f3f0c3f07391..1962c8330daa 100644 --- a/drivers/net/dsa/bcm_sf2_cfp.c +++ b/drivers/net/dsa/bcm_sf2_cfp.c @@ -358,7 +358,7 @@ static int bcm_sf2_cfp_ipv4_rule_set(struct bcm_sf2_priv *priv, int port, return -EINVAL; } - ip_frag = be32_to_cpu(fs->m_ext.data[0]); + ip_frag = !!(be32_to_cpu(fs->h_ext.data[0]) & 1); /* Locate the first rule available */ if (fs->location == RX_CLS_LOC_ANY) @@ -569,7 +569,7 @@ static int bcm_sf2_cfp_rule_cmp(struct bcm_sf2_priv *priv, int port, if (rule->fs.flow_type != fs->flow_type || rule->fs.ring_cookie != fs->ring_cookie || - rule->fs.m_ext.data[0] != fs->m_ext.data[0]) + rule->fs.h_ext.data[0] != fs->h_ext.data[0]) continue; switch (fs->flow_type & ~FLOW_EXT) { @@ -621,7 +621,7 @@ static int bcm_sf2_cfp_ipv6_rule_set(struct bcm_sf2_priv *priv, int port, return -EINVAL; } - ip_frag = be32_to_cpu(fs->m_ext.data[0]); + ip_frag = !!(be32_to_cpu(fs->h_ext.data[0]) & 1); layout = &udf_tcpip6_layout; slice_num = bcm_sf2_get_slice_number(layout, 0); From bd085ef678b2cc8c38c105673dfe8ff8f5ec0c57 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:09 +0800 Subject: [PATCH 043/146] net: add bool confirm_neigh parameter for dst_ops.update_pmtu The MTU update code is supposed to be invoked in response to real networking events that update the PMTU. In IPv6 PMTU update function __ip6_rt_update_pmtu() we called dst_confirm_neigh() to update neighbor confirmed time. But for tunnel code, it will call pmtu before xmit, like: - tnl_update_pmtu() - skb_dst_update_pmtu() - ip6_rt_update_pmtu() - __ip6_rt_update_pmtu() - dst_confirm_neigh() If the tunnel remote dst mac address changed and we still do the neigh confirm, we will not be able to update neigh cache and ping6 remote will failed. So for this ip_tunnel_xmit() case, _EVEN_ if the MTU is changed, we should not be invoking dst_confirm_neigh() as we have no evidence of successful two-way communication at this point. On the other hand it is also important to keep the neigh reachability fresh for TCP flows, so we cannot remove this dst_confirm_neigh() call. To fix the issue, we have to add a new bool parameter for dst_ops.update_pmtu to choose whether we should do neigh update or not. I will add the parameter in this patch and set all the callers to true to comply with the previous way, and fix the tunnel code one by one on later patches. v5: No change. v4: No change. v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Suggested-by: David Miller Reviewed-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- drivers/net/gtp.c | 2 +- include/net/dst.h | 2 +- include/net/dst_ops.h | 3 ++- net/bridge/br_nf_core.c | 3 ++- net/decnet/dn_route.c | 6 ++++-- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/route.c | 9 ++++++--- net/ipv4/xfrm4_policy.c | 5 +++-- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_gre.c | 2 +- net/ipv6/route.c | 22 +++++++++++++++------- net/ipv6/xfrm6_policy.c | 5 +++-- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- net/sctp/transport.c | 2 +- 14 files changed, 42 insertions(+), 25 deletions(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index e5b7d6d2286e..913062017be9 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -540,7 +540,7 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, mtu = dst_mtu(&rt->dst); } - rt->dst.ops->update_pmtu(&rt->dst, NULL, skb, mtu); + rt->dst.ops->update_pmtu(&rt->dst, NULL, skb, mtu, true); if (!skb_is_gso(skb) && (iph->frag_off & htons(IP_DF)) && mtu < ntohs(iph->tot_len)) { diff --git a/include/net/dst.h b/include/net/dst.h index 8224dad2ae94..593630e0e076 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -516,7 +516,7 @@ static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu) struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops->update_pmtu) - dst->ops->update_pmtu(dst, NULL, skb, mtu); + dst->ops->update_pmtu(dst, NULL, skb, mtu, true); } static inline void skb_tunnel_check_pmtu(struct sk_buff *skb, diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 5ec645f27ee3..443863c7b8da 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -27,7 +27,8 @@ struct dst_ops { struct dst_entry * (*negative_advice)(struct dst_entry *); void (*link_failure)(struct sk_buff *); void (*update_pmtu)(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu); + struct sk_buff *skb, u32 mtu, + bool confirm_neigh); void (*redirect)(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); int (*local_out)(struct net *net, struct sock *sk, struct sk_buff *skb); diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c index 2cdfc5d6c25d..8c69f0c95a8e 100644 --- a/net/bridge/br_nf_core.c +++ b/net/bridge/br_nf_core.c @@ -22,7 +22,8 @@ #endif static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index aea918135ec3..08c3dc45f1a4 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -110,7 +110,8 @@ static void dn_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); static struct dst_entry *dn_dst_negative_advice(struct dst_entry *); static void dn_dst_link_failure(struct sk_buff *); static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb , u32 mtu); + struct sk_buff *skb , u32 mtu, + bool confirm_neigh); static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst, @@ -251,7 +252,8 @@ static int dn_dst_gc(struct dst_ops *ops) * advertise to the other end). */ static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { struct dn_route *rt = (struct dn_route *) dst; struct neighbour *n = rt->n; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index e4c6e8b40490..18c0d5bffe12 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1086,7 +1086,7 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) if (!dst) goto out; } - dst->ops->update_pmtu(dst, sk, NULL, mtu); + dst->ops->update_pmtu(dst, sk, NULL, mtu, true); dst = __sk_dst_check(sk, 0); if (!dst) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f88c93c38f11..87e979f2b74a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -139,7 +139,8 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu); + struct sk_buff *skb, u32 mtu, + bool confirm_neigh); static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static void ipv4_dst_destroy(struct dst_entry *dst); @@ -1043,7 +1044,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) } static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { struct rtable *rt = (struct rtable *) dst; struct flowi4 fl4; @@ -2687,7 +2689,8 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) } static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 35b84b52b702..9ebd54752e03 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -100,12 +100,13 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, } static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->update_pmtu(path, sk, skb, mtu); + path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh); } static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk, diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index fe9cb8d1adca..e315526fa244 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -146,7 +146,7 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) if (IS_ERR(dst)) return NULL; - dst->ops->update_pmtu(dst, sk, NULL, mtu); + dst->ops->update_pmtu(dst, sk, NULL, mtu, true); dst = inet6_csk_route_socket(sk, &fl6); return IS_ERR(dst) ? NULL : dst; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 9d0965252ddf..3ba69174ad6c 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1040,7 +1040,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) - dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu); + dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, true); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, NEXTHDR_GRE); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b59940416cb5..affb51c11a25 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -95,7 +95,8 @@ static int ip6_pkt_prohibit(struct sk_buff *skb); static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu); + struct sk_buff *skb, u32 mtu, + bool confirm_neigh); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, @@ -264,7 +265,8 @@ static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) } static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { } @@ -2692,7 +2694,8 @@ static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, - const struct ipv6hdr *iph, u32 mtu) + const struct ipv6hdr *iph, u32 mtu, + bool confirm_neigh) { const struct in6_addr *daddr, *saddr; struct rt6_info *rt6 = (struct rt6_info *)dst; @@ -2710,7 +2713,10 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, daddr = NULL; saddr = NULL; } - dst_confirm_neigh(dst, daddr); + + if (confirm_neigh) + dst_confirm_neigh(dst, daddr); + mtu = max_t(u32, mtu, IPV6_MIN_MTU); if (mtu >= dst_mtu(dst)) return; @@ -2764,9 +2770,11 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, } static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { - __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); + __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu, + confirm_neigh); } void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, @@ -2785,7 +2793,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) - __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); + __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_update_pmtu); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 699e0730ce8e..af7a4b8b1e9c 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -98,12 +98,13 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, } static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->update_pmtu(path, sk, skb, mtu); + path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh); } static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk, diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index b1e300f8881b..b00866d777fe 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -208,7 +208,7 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) struct rtable *ort = skb_rtable(skb); if (!skb->dev && sk && sk_fullsock(sk)) - ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true); } static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 7235a6032671..3bbe1a58ec87 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -263,7 +263,7 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) pf->af->from_sk(&addr, sk); pf->to_sk_daddr(&t->ipaddr, sk); - dst->ops->update_pmtu(dst, sk, NULL, pmtu); + dst->ops->update_pmtu(dst, sk, NULL, pmtu, true); pf->to_sk_daddr(&addr, sk); dst = sctp_transport_dst_check(t); From 675d76ad0ad5bf41c9a129772ef0aba8f57ea9a7 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:10 +0800 Subject: [PATCH 044/146] ip6_gre: do not confirm neighbor when do pmtu update When we do ipv6 gre pmtu update, we will also do neigh confirm currently. This will cause the neigh cache be refreshed and set to REACHABLE before xmit. But if the remote mac address changed, e.g. device is deleted and recreated, we will not able to notice this and still use the old mac address as the neigh cache is REACHABLE. Fix this by disable neigh confirm when do pmtu update v5: No change. v4: No change. v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Reported-by: Jianlin Shi Reviewed-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3ba69174ad6c..ee968d980746 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1040,7 +1040,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) - dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, true); + dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, false); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, NEXTHDR_GRE); From 6e9105c73f8d2163d12d5dfd762fd75483ed30f5 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:11 +0800 Subject: [PATCH 045/146] gtp: do not confirm neighbor when do pmtu update When do IPv6 tunnel PMTU update and calls __ip6_rt_update_pmtu() in the end, we should not call dst_confirm_neigh() as there is no two-way communication. Although GTP only support ipv4 right now, and __ip_rt_update_pmtu() does not call dst_confirm_neigh(), we still set it to false to keep consistency with IPv6 code. v5: No change. v4: No change. v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Reviewed-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- drivers/net/gtp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 913062017be9..fca471e27f39 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -540,7 +540,7 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, mtu = dst_mtu(&rt->dst); } - rt->dst.ops->update_pmtu(&rt->dst, NULL, skb, mtu, true); + rt->dst.ops->update_pmtu(&rt->dst, NULL, skb, mtu, false); if (!skb_is_gso(skb) && (iph->frag_off & htons(IP_DF)) && mtu < ntohs(iph->tot_len)) { From 07dc35c6e3cc3c001915d05f5bf21f80a39a0970 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:12 +0800 Subject: [PATCH 046/146] net/dst: add new function skb_dst_update_pmtu_no_confirm Add a new function skb_dst_update_pmtu_no_confirm() for callers who need update pmtu but should not do neighbor confirm. v5: No change. v4: No change. v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Reviewed-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/dst.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/net/dst.h b/include/net/dst.h index 593630e0e076..dc7cc1f1051c 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -519,6 +519,15 @@ static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, true); } +/* update dst pmtu but not do neighbor confirm */ +static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) +{ + struct dst_entry *dst = skb_dst(skb); + + if (dst && dst->ops->update_pmtu) + dst->ops->update_pmtu(dst, NULL, skb, mtu, false); +} + static inline void skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, int headroom) From 7a1592bcb15d71400a98632727791d1e68ea0ee8 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:13 +0800 Subject: [PATCH 047/146] tunnel: do not confirm neighbor when do pmtu update When do tunnel PMTU update and calls __ip6_rt_update_pmtu() in the end, we should not call dst_confirm_neigh() as there is no two-way communication. v5: No Change. v4: Update commit description v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Fixes: 0dec879f636f ("net: use dst_confirm_neigh for UDP, RAW, ICMP, L2TP") Reviewed-by: Guillaume Nault Tested-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 2 +- net/ipv6/ip6_tunnel.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 38c02bb62e2c..0fe2a5d3e258 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -505,7 +505,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; if (skb_valid_dst(skb)) - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { if (!skb_is_gso(skb) && diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 754a484d35df..2f376dbc37d5 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -640,7 +640,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (rel_info > dst_mtu(skb_dst(skb2))) goto out; - skb_dst_update_pmtu(skb2, rel_info); + skb_dst_update_pmtu_no_confirm(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); @@ -1132,7 +1132,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ? IPV6_MIN_MTU : IPV4_MIN_MTU); - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; From 8247a79efa2f28b44329f363272550c1738377de Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:14 +0800 Subject: [PATCH 048/146] vti: do not confirm neighbor when do pmtu update When do IPv6 tunnel PMTU update and calls __ip6_rt_update_pmtu() in the end, we should not call dst_confirm_neigh() as there is no two-way communication. Although vti and vti6 are immune to this problem because they are IFF_NOARP interfaces, as Guillaume pointed. There is still no sense to confirm neighbour here. v5: Update commit description. v4: No change. v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Reviewed-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv4/ip_vti.c | 2 +- net/ipv6/ip6_vti.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 9b153c7fcbb4..e90b600c7a25 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -214,7 +214,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, mtu = dst_mtu(dst); if (skb->len > mtu) { - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 024db17386d2..6f08b760c2a7 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -479,7 +479,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) mtu = dst_mtu(dst); if (skb->len > mtu) { - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) From 4d42df46d6372ece4cb4279870b46c2ea7304a47 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:15 +0800 Subject: [PATCH 049/146] sit: do not confirm neighbor when do pmtu update When do IPv6 tunnel PMTU update and calls __ip6_rt_update_pmtu() in the end, we should not call dst_confirm_neigh() as there is no two-way communication. v5: No change. v4: No change. v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Reviewed-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/sit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index b2ccbc473127..98954830c40b 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -944,7 +944,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, } if (tunnel->parms.iph.daddr) - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len > mtu && !skb_is_gso(skb)) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); From f081042d128a0c7acbd67611def62e1b52e2d294 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:16 +0800 Subject: [PATCH 050/146] net/dst: do not confirm neighbor for vxlan and geneve pmtu update When do IPv6 tunnel PMTU update and calls __ip6_rt_update_pmtu() in the end, we should not call dst_confirm_neigh() as there is no two-way communication. So disable the neigh confirm for vxlan and geneve pmtu update. v5: No change. v4: No change. v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Fixes: a93bf0ff4490 ("vxlan: update skb dst pmtu on tx path") Fixes: 52a589d51f10 ("geneve: update skb dst pmtu on tx path") Reviewed-by: Guillaume Nault Tested-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/dst.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/dst.h b/include/net/dst.h index dc7cc1f1051c..3448cf865ede 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -535,7 +535,7 @@ static inline void skb_tunnel_check_pmtu(struct sk_buff *skb, u32 encap_mtu = dst_mtu(encap_dst); if (skb->len > encap_mtu - headroom) - skb_dst_update_pmtu(skb, encap_mtu - headroom); + skb_dst_update_pmtu_no_confirm(skb, encap_mtu - headroom); } #endif /* _NET_DST_H */ From 8b5026bc16938920e4780b9094c3bf20e1e0939d Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Mon, 23 Dec 2019 15:03:21 +0100 Subject: [PATCH 051/146] s390/qeth: fix qdio teardown after early init error qeth_l?_set_online() goes through a number of initialization steps, and on any error uses qeth_l?_stop_card() to tear down the residual state. The first initialization step is qeth_core_hardsetup_card(). When this fails after having established a QDIO context on the device (ie. somewhere after qeth_mpc_initialize()), qeth_l?_stop_card() doesn't shut down this QDIO context again (since the card state hasn't progressed from DOWN at this stage). Even worse, we then call qdio_free() as final teardown step to free the QDIO data structures - while some of them are still hooked into wider QDIO infrastructure such as the IRQ list. This is inevitably followed by use-after-frees and other nastyness. Fix this by unconditionally calling qeth_qdio_clear_card() to shut down the QDIO context, and also to halt/clear any pending activity on the various IO channels. Remove the naive attempt at handling the teardown in qeth_mpc_initialize(), it clearly doesn't suffice and we're handling it properly now in the wider teardown code. Fixes: 4a71df50047f ("qeth: new qeth device driver") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 20 ++++++++------------ drivers/s390/net/qeth_l2_main.c | 2 +- drivers/s390/net/qeth_l3_main.c | 2 +- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index bc4158888af9..324cf22f9111 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -2482,50 +2482,46 @@ static int qeth_mpc_initialize(struct qeth_card *card) rc = qeth_cm_enable(card); if (rc) { QETH_CARD_TEXT_(card, 2, "2err%d", rc); - goto out_qdio; + return rc; } rc = qeth_cm_setup(card); if (rc) { QETH_CARD_TEXT_(card, 2, "3err%d", rc); - goto out_qdio; + return rc; } rc = qeth_ulp_enable(card); if (rc) { QETH_CARD_TEXT_(card, 2, "4err%d", rc); - goto out_qdio; + return rc; } rc = qeth_ulp_setup(card); if (rc) { QETH_CARD_TEXT_(card, 2, "5err%d", rc); - goto out_qdio; + return rc; } rc = qeth_alloc_qdio_queues(card); if (rc) { QETH_CARD_TEXT_(card, 2, "5err%d", rc); - goto out_qdio; + return rc; } rc = qeth_qdio_establish(card); if (rc) { QETH_CARD_TEXT_(card, 2, "6err%d", rc); qeth_free_qdio_queues(card); - goto out_qdio; + return rc; } rc = qeth_qdio_activate(card); if (rc) { QETH_CARD_TEXT_(card, 2, "7err%d", rc); - goto out_qdio; + return rc; } rc = qeth_dm_act(card); if (rc) { QETH_CARD_TEXT_(card, 2, "8err%d", rc); - goto out_qdio; + return rc; } return 0; -out_qdio: - qeth_qdio_clear_card(card, !IS_IQD(card)); - qdio_free(CARD_DDEV(card)); - return rc; } void qeth_print_status_message(struct qeth_card *card) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 8c95e6019bac..15e2fd65d434 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -287,12 +287,12 @@ static void qeth_l2_stop_card(struct qeth_card *card) card->state = CARD_STATE_HARDSETUP; } if (card->state == CARD_STATE_HARDSETUP) { - qeth_qdio_clear_card(card, 0); qeth_drain_output_queues(card); qeth_clear_working_pool_list(card); card->state = CARD_STATE_DOWN; } + qeth_qdio_clear_card(card, 0); flush_workqueue(card->event_wq); card->info.mac_bits &= ~QETH_LAYER2_MAC_REGISTERED; card->info.promisc_mode = 0; diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 04e301de376f..5508ab89b518 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -1307,12 +1307,12 @@ static void qeth_l3_stop_card(struct qeth_card *card) card->state = CARD_STATE_HARDSETUP; } if (card->state == CARD_STATE_HARDSETUP) { - qeth_qdio_clear_card(card, 0); qeth_drain_output_queues(card); qeth_clear_working_pool_list(card); card->state = CARD_STATE_DOWN; } + qeth_qdio_clear_card(card, 0); flush_workqueue(card->event_wq); card->info.promisc_mode = 0; } From 5b6c7b55cfe26224b0f41b1c226d3534c542787f Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Mon, 23 Dec 2019 15:03:22 +0100 Subject: [PATCH 052/146] s390/qeth: lock the card while changing its hsuid qeth_l3_dev_hsuid_store() initially checks the card state, but doesn't take the conf_mutex to ensure that the card stays in this state while being reconfigured. Rework the code to take this lock, and drop a redundant state check in a helper function. Fixes: b333293058aa ("qeth: add support for af_iucv HiperSockets transport") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 5 ---- drivers/s390/net/qeth_l3_sys.c | 40 +++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 324cf22f9111..c64ef55f0dff 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -3425,11 +3425,6 @@ int qeth_configure_cq(struct qeth_card *card, enum qeth_cq cq) goto out; } - if (card->state != CARD_STATE_DOWN) { - rc = -1; - goto out; - } - qeth_free_qdio_queues(card); card->options.cq = cq; rc = 0; diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c index f9067ed6c7d3..e8c848f72c6d 100644 --- a/drivers/s390/net/qeth_l3_sys.c +++ b/drivers/s390/net/qeth_l3_sys.c @@ -242,21 +242,33 @@ static ssize_t qeth_l3_dev_hsuid_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct qeth_card *card = dev_get_drvdata(dev); + int rc = 0; char *tmp; - int rc; if (!IS_IQD(card)) return -EPERM; - if (card->state != CARD_STATE_DOWN) - return -EPERM; - if (card->options.sniffer) - return -EPERM; - if (card->options.cq == QETH_CQ_NOTAVAILABLE) - return -EPERM; + + mutex_lock(&card->conf_mutex); + if (card->state != CARD_STATE_DOWN) { + rc = -EPERM; + goto out; + } + + if (card->options.sniffer) { + rc = -EPERM; + goto out; + } + + if (card->options.cq == QETH_CQ_NOTAVAILABLE) { + rc = -EPERM; + goto out; + } tmp = strsep((char **)&buf, "\n"); - if (strlen(tmp) > 8) - return -EINVAL; + if (strlen(tmp) > 8) { + rc = -EINVAL; + goto out; + } if (card->options.hsuid[0]) /* delete old ip address */ @@ -267,11 +279,13 @@ static ssize_t qeth_l3_dev_hsuid_store(struct device *dev, card->options.hsuid[0] = '\0'; memcpy(card->dev->perm_addr, card->options.hsuid, 9); qeth_configure_cq(card, QETH_CQ_DISABLED); - return count; + goto out; } - if (qeth_configure_cq(card, QETH_CQ_ENABLED)) - return -EPERM; + if (qeth_configure_cq(card, QETH_CQ_ENABLED)) { + rc = -EPERM; + goto out; + } snprintf(card->options.hsuid, sizeof(card->options.hsuid), "%-8s", tmp); @@ -280,6 +294,8 @@ static ssize_t qeth_l3_dev_hsuid_store(struct device *dev, rc = qeth_l3_modify_hsuid(card, true); +out: + mutex_unlock(&card->conf_mutex); return rc ? rc : count; } From 68c57bfd52836e31bff33e5e1fc64029749d2c35 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Mon, 23 Dec 2019 15:03:23 +0100 Subject: [PATCH 053/146] s390/qeth: fix false reporting of VNIC CHAR config failure Symptom: Error message "Configuring the VNIC characteristics failed" in dmesg whenever an OSA interface on z15 is set online. The VNIC characteristics get re-programmed when setting a L2 device online. This follows the selected 'wanted' characteristics - with the exception that the INVISIBLE characteristic unconditionally gets switched off. For devices that don't support INVISIBLE (ie. OSA), the resulting IO failure raises a noisy error message ("Configuring the VNIC characteristics failed"). For IQD, INVISIBLE is off by default anyways. So don't unnecessarily special-case the INVISIBLE characteristic, and thereby suppress the misleading error message on OSA devices. Fixes: caa1f0b10d18 ("s390/qeth: add VNICC enable/disable support") Signed-off-by: Alexandra Winter Reviewed-by: Julian Wiedmann Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l2_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 15e2fd65d434..fc5d8ed3a737 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -2041,7 +2041,6 @@ static void qeth_l2_vnicc_init(struct qeth_card *card) error |= qeth_l2_vnicc_recover_timeout(card, QETH_VNICC_LEARNING, timeout); chars_tmp = card->options.vnicc.wanted_chars ^ QETH_VNICC_DEFAULT; - chars_tmp |= QETH_VNICC_BRIDGE_INVISIBLE; chars_len = sizeof(card->options.vnicc.wanted_chars) * BITS_PER_BYTE; for_each_set_bit(i, &chars_tmp, chars_len) { vnicc = BIT(i); From e8a66d800471e2df7f0b484e2e46898b21d1fa82 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Mon, 23 Dec 2019 15:03:24 +0100 Subject: [PATCH 054/146] s390/qeth: Fix vnicc_is_in_use if rx_bcast not set Symptom: After vnicc/rx_bcast has been manually set to 0, bridge_* sysfs parameters can still be set or written. Only occurs on HiperSockets, as OSA doesn't support changing rx_bcast. Vnic characteristics and bridgeport settings are mutually exclusive. rx_bcast defaults to 1, so manually setting it to 0 should disable bridge_* parameters. Instead it makes sense here to check the supported mask. If the card does not support vnicc at all, bridge commands are always allowed. Fixes: caa1f0b10d18 ("s390/qeth: add VNICC enable/disable support") Signed-off-by: Alexandra Winter Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l2_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index fc5d8ed3a737..8024a2112a87 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1952,8 +1952,7 @@ int qeth_l2_vnicc_get_timeout(struct qeth_card *card, u32 *timeout) /* check if VNICC is currently enabled */ bool qeth_l2_vnicc_is_in_use(struct qeth_card *card) { - /* if everything is turned off, VNICC is not active */ - if (!card->options.vnicc.cur_chars) + if (!card->options.vnicc.sup_chars) return false; /* default values are only OK if rx_bcast was not enabled by user * or the card is offline. From d1b9ae1864fc3c000e0eb4af8482d78c63e0915a Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Mon, 23 Dec 2019 15:03:25 +0100 Subject: [PATCH 055/146] s390/qeth: vnicc Fix init to default During vnicc_init wanted_char should be compared to cur_char and not to QETH_VNICC_DEFAULT. Without this patch there is no way to enforce the default values as desired values. Note, that it is expected, that a card comes online with default values. This patch was tested with private card firmware. Fixes: caa1f0b10d18 ("s390/qeth: add VNICC enable/disable support") Signed-off-by: Alexandra Winter Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l2_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 8024a2112a87..47d37e75dda6 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -2039,7 +2039,9 @@ static void qeth_l2_vnicc_init(struct qeth_card *card) /* enforce assumed default values and recover settings, if changed */ error |= qeth_l2_vnicc_recover_timeout(card, QETH_VNICC_LEARNING, timeout); - chars_tmp = card->options.vnicc.wanted_chars ^ QETH_VNICC_DEFAULT; + /* Change chars, if necessary */ + chars_tmp = card->options.vnicc.wanted_chars ^ + card->options.vnicc.cur_chars; chars_len = sizeof(card->options.vnicc.wanted_chars) * BITS_PER_BYTE; for_each_set_bit(i, &chars_tmp, chars_len) { vnicc = BIT(i); From 0b698c838e84149b690c7e979f78cccb6f8aa4b9 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Mon, 23 Dec 2019 15:03:26 +0100 Subject: [PATCH 056/146] s390/qeth: fix initialization on old HW I stumbled over an old OSA model that claims to support DIAG_ASSIST, but then rejects the cmd to query its DIAG capabilities. In the old code this was ok, as the returned raw error code was > 0. Now that we translate the raw codes to errnos, the "rc < 0" causes us to fail the initialization of the device. The fix is trivial: don't bail out when the DIAG query fails. Such an error is not critical, we can still use the device (with a slightly reduced set of features). Fixes: 742d4d40831d ("s390/qeth: convert remaining legacy cmd callbacks") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index c64ef55f0dff..29facb913671 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -5026,10 +5026,8 @@ int qeth_core_hardsetup_card(struct qeth_card *card, bool *carrier_ok) } if (qeth_adp_supported(card, IPA_SETADP_SET_DIAG_ASSIST)) { rc = qeth_query_setdiagass(card); - if (rc < 0) { + if (rc) QETH_CARD_TEXT_(card, 2, "8err%d", rc); - goto out; - } } if (!qeth_is_diagass_supported(card, QETH_DIAGS_CMD_TRAP) || From 57b948e2c9ce58e4fcb687bc285b5b5a2cc66640 Mon Sep 17 00:00:00 2001 From: Netanel Belgazal Date: Sun, 22 Dec 2019 09:47:59 +0000 Subject: [PATCH 057/146] MAINTAINERS: Add additional maintainers to ENA Ethernet driver Signed-off-by: Netanel Belgazal Signed-off-by: David S. Miller --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index ffa3371bc750..992474b930bf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -771,6 +771,8 @@ F: drivers/thermal/thermal_mmio.c AMAZON ETHERNET DRIVERS M: Netanel Belgazal +M: Arthur Kiyanovski +R: Guy Tzalik R: Saeed Bishara R: Zorik Machulsky L: netdev@vger.kernel.org From 84bb59d773853bc2dda2ac1ef8474c40eb33a3c6 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 22 Dec 2019 11:25:27 +0000 Subject: [PATCH 058/146] hsr: avoid debugfs warning message when module is remove When hsr module is being removed, debugfs_remove() is called to remove both debugfs directory and file. When module is being removed, module state is changed to MODULE_STATE_GOING then exit() is called. At this moment, module couldn't be held so try_module_get() will be failed. debugfs's open() callback tries to hold the module if .owner is existing. If it fails, warning message is printed. CPU0 CPU1 delete_module() try_stop_module() hsr_exit() open() <-- WARNING debugfs_remove() In order to avoid the warning message, this patch makes hsr module does not set .owner. Unsetting .owner is safe because these are protected by inode_lock(). Test commands: #SHELL1 ip link add dummy0 type dummy ip link add dummy1 type dummy while : do ip link add hsr0 type hsr slave1 dummy0 slave2 dummy1 modprobe -rv hsr done #SHELL2 while : do cat /sys/kernel/debug/hsr0/node_table done Splat looks like: [ 101.223783][ T1271] ------------[ cut here ]------------ [ 101.230309][ T1271] debugfs file owner did not clean up at exit: node_table [ 101.230380][ T1271] WARNING: CPU: 3 PID: 1271 at fs/debugfs/file.c:309 full_proxy_open+0x10f/0x650 [ 101.233153][ T1271] Modules linked in: hsr(-) dummy veth openvswitch nsh nf_conncount nf_nat nf_conntrack nf_d] [ 101.237112][ T1271] CPU: 3 PID: 1271 Comm: cat Tainted: G W 5.5.0-rc1+ #204 [ 101.238270][ T1271] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 101.240379][ T1271] RIP: 0010:full_proxy_open+0x10f/0x650 [ 101.241166][ T1271] Code: 48 c1 ea 03 80 3c 02 00 0f 85 c1 04 00 00 49 8b 3c 24 e8 04 86 7e ff 84 c0 75 2d 4c 8 [ 101.251985][ T1271] RSP: 0018:ffff8880ca22fa38 EFLAGS: 00010286 [ 101.273355][ T1271] RAX: dffffc0000000008 RBX: ffff8880cc6e6200 RCX: 0000000000000000 [ 101.274466][ T1271] RDX: 0000000000000000 RSI: 0000000000000006 RDI: ffff8880c4dd5c14 [ 101.275581][ T1271] RBP: 0000000000000000 R08: fffffbfff2922f5d R09: 0000000000000000 [ 101.276733][ T1271] R10: 0000000000000001 R11: 0000000000000000 R12: ffffffffc0551bc0 [ 101.277853][ T1271] R13: ffff8880c4059a48 R14: ffff8880be50a5e0 R15: ffffffff941adaa0 [ 101.278956][ T1271] FS: 00007f8871cda540(0000) GS:ffff8880da800000(0000) knlGS:0000000000000000 [ 101.280216][ T1271] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 101.282832][ T1271] CR2: 00007f88717cfd10 CR3: 00000000b9440005 CR4: 00000000000606e0 [ 101.283974][ T1271] Call Trace: [ 101.285328][ T1271] do_dentry_open+0x63c/0xf50 [ 101.286077][ T1271] ? open_proxy_open+0x270/0x270 [ 101.288271][ T1271] ? __x64_sys_fchdir+0x180/0x180 [ 101.288987][ T1271] ? inode_permission+0x65/0x390 [ 101.289682][ T1271] path_openat+0x701/0x2810 [ 101.290294][ T1271] ? path_lookupat+0x880/0x880 [ 101.290957][ T1271] ? check_chain_key+0x236/0x5d0 [ 101.291676][ T1271] ? __lock_acquire+0xdfe/0x3de0 [ 101.292358][ T1271] ? sched_clock+0x5/0x10 [ 101.292962][ T1271] ? sched_clock_cpu+0x18/0x170 [ 101.293644][ T1271] ? find_held_lock+0x39/0x1d0 [ 101.305616][ T1271] do_filp_open+0x17a/0x270 [ 101.306061][ T1271] ? may_open_dev+0xc0/0xc0 [ ... ] Fixes: fc4ecaeebd26 ("net: hsr: add debugfs support for display node list") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_debugfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 94447974a3c0..6135706f03d5 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -64,7 +64,6 @@ hsr_node_table_open(struct inode *inode, struct file *filp) } static const struct file_operations hsr_fops = { - .owner = THIS_MODULE, .open = hsr_node_table_open, .read = seq_read, .llseek = seq_lseek, From 1d19e2d53e8ed9e4c98fc95e0067492cda7288b0 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 22 Dec 2019 11:26:15 +0000 Subject: [PATCH 059/146] hsr: fix error handling routine in hsr_dev_finalize() hsr_dev_finalize() is called to create new hsr interface. There are some wrong error handling codes. 1. wrong checking return value of debugfs_create_{dir/file}. These function doesn't return NULL. If error occurs in there, it returns error pointer. So, it should check error pointer instead of NULL. 2. It doesn't unregister interface if it fails to setup hsr interface. If it fails to initialize hsr interface after register_netdevice(), it should call unregister_netdevice(). 3. Ignore failure of creation of debugfs If creating of debugfs dir and file is failed, creating hsr interface will be failed. But debugfs doesn't affect actual logic of hsr module. So, ignoring this is more correct and this behavior is more general. Fixes: c5a759117210 ("net/hsr: Use list_head (and rcu) instead of array for slave devices.") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_debugfs.c | 15 +++++++-------- net/hsr/hsr_device.c | 19 ++++++++++--------- net/hsr/hsr_main.h | 11 ++++------- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 6135706f03d5..6618a9d8e58e 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -77,15 +77,14 @@ static const struct file_operations hsr_fops = { * When debugfs is configured this routine sets up the node_table file per * hsr device for dumping the node_table entries */ -int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) +void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) { - int rc = -1; struct dentry *de = NULL; de = debugfs_create_dir(hsr_dev->name, NULL); - if (!de) { + if (IS_ERR(de)) { pr_err("Cannot create hsr debugfs root\n"); - return rc; + return; } priv->node_tbl_root = de; @@ -93,13 +92,13 @@ int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) de = debugfs_create_file("node_table", S_IFREG | 0444, priv->node_tbl_root, priv, &hsr_fops); - if (!de) { + if (IS_ERR(de)) { pr_err("Cannot create hsr node_table directory\n"); - return rc; + debugfs_remove(priv->node_tbl_root); + priv->node_tbl_root = NULL; + return; } priv->node_tbl_file = de; - - return 0; } /* hsr_debugfs_term - Tear down debugfs intrastructure diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index b01e1bae4ddc..e73549075a03 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -477,30 +477,31 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER); if (res) - goto err_add_port; + goto err_add_master; res = register_netdevice(hsr_dev); if (res) - goto fail; + goto err_unregister; res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A); if (res) - goto fail; + goto err_add_slaves; + res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B); if (res) - goto fail; + goto err_add_slaves; + hsr_debugfs_init(hsr, hsr_dev); mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); - res = hsr_debugfs_init(hsr, hsr_dev); - if (res) - goto fail; return 0; -fail: +err_add_slaves: + unregister_netdevice(hsr_dev); +err_unregister: list_for_each_entry_safe(port, tmp, &hsr->ports, port_list) hsr_del_port(port); -err_add_port: +err_add_master: hsr_del_self_node(&hsr->self_node_db); return res; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 96fac696a1e1..acab9c353a49 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -184,15 +184,12 @@ static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb) } #if IS_ENABLED(CONFIG_DEBUG_FS) -int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev); +void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev); void hsr_debugfs_term(struct hsr_priv *priv); #else -static inline int hsr_debugfs_init(struct hsr_priv *priv, - struct net_device *hsr_dev) -{ - return 0; -} - +static inline void hsr_debugfs_init(struct hsr_priv *priv, + struct net_device *hsr_dev) +{} static inline void hsr_debugfs_term(struct hsr_priv *priv) {} #endif From c6c4ccd7f96993e106dfea7ef18127f972f2db5e Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 22 Dec 2019 11:26:27 +0000 Subject: [PATCH 060/146] hsr: add hsr root debugfs directory In current hsr code, when hsr interface is created, it creates debugfs directory /sys/kernel/debug/. If there is same directory or file name in there, it fails. In order to reduce possibility of failure of creation of debugfs, this patch adds root directory. Test commands: ip link add dummy0 type dummy ip link add dummy1 type dummy ip link add hsr0 type hsr slave1 dummy0 slave2 dummy1 Before this patch: /sys/kernel/debug/hsr0/node_table After this patch: /sys/kernel/debug/hsr/hsr0/node_table Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_debugfs.c | 23 ++++++++++++++++++++--- net/hsr/hsr_main.c | 1 + net/hsr/hsr_main.h | 6 ++++++ net/hsr/hsr_netlink.c | 1 + 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 6618a9d8e58e..a7462a718e7b 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -20,6 +20,8 @@ #include "hsr_main.h" #include "hsr_framereg.h" +static struct dentry *hsr_debugfs_root_dir; + static void print_mac_address(struct seq_file *sfp, unsigned char *mac) { seq_printf(sfp, "%02x:%02x:%02x:%02x:%02x:%02x:", @@ -81,9 +83,9 @@ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) { struct dentry *de = NULL; - de = debugfs_create_dir(hsr_dev->name, NULL); + de = debugfs_create_dir(hsr_dev->name, hsr_debugfs_root_dir); if (IS_ERR(de)) { - pr_err("Cannot create hsr debugfs root\n"); + pr_err("Cannot create hsr debugfs directory\n"); return; } @@ -93,7 +95,7 @@ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) priv->node_tbl_root, priv, &hsr_fops); if (IS_ERR(de)) { - pr_err("Cannot create hsr node_table directory\n"); + pr_err("Cannot create hsr node_table file\n"); debugfs_remove(priv->node_tbl_root); priv->node_tbl_root = NULL; return; @@ -115,3 +117,18 @@ hsr_debugfs_term(struct hsr_priv *priv) debugfs_remove(priv->node_tbl_root); priv->node_tbl_root = NULL; } + +void hsr_debugfs_create_root(void) +{ + hsr_debugfs_root_dir = debugfs_create_dir("hsr", NULL); + if (IS_ERR(hsr_debugfs_root_dir)) { + pr_err("Cannot create hsr debugfs root directory\n"); + hsr_debugfs_root_dir = NULL; + } +} + +void hsr_debugfs_remove_root(void) +{ + /* debugfs_remove() internally checks NULL and ERROR */ + debugfs_remove(hsr_debugfs_root_dir); +} diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index b9988a662ee1..490896379073 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -123,6 +123,7 @@ static void __exit hsr_exit(void) { unregister_netdevice_notifier(&hsr_nb); hsr_netlink_exit(); + hsr_debugfs_remove_root(); } module_init(hsr_init); diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index acab9c353a49..55d2057bf749 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -186,12 +186,18 @@ static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb) #if IS_ENABLED(CONFIG_DEBUG_FS) void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev); void hsr_debugfs_term(struct hsr_priv *priv); +void hsr_debugfs_create_root(void); +void hsr_debugfs_remove_root(void); #else static inline void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) {} static inline void hsr_debugfs_term(struct hsr_priv *priv) {} +static inline void hsr_debugfs_create_root(void) +{} +static inline void hsr_debugfs_remove_root(void) +{} #endif #endif /* __HSR_PRIVATE_H */ diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 8f8337f893ba..8dc0547f01d0 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -476,6 +476,7 @@ int __init hsr_netlink_init(void) if (rc) goto fail_genl_register_family; + hsr_debugfs_create_root(); return 0; fail_genl_register_family: From 4c2d5e33dcd3a6333a7895be3b542ff3d373177c Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 22 Dec 2019 11:26:39 +0000 Subject: [PATCH 061/146] hsr: rename debugfs file when interface name is changed hsr interface has own debugfs file, which name is same with interface name. So, interface name is changed, debugfs file name should be changed too. Fixes: fc4ecaeebd26 ("net: hsr: add debugfs support for display node list") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_debugfs.c | 13 +++++++++++++ net/hsr/hsr_main.c | 3 +++ net/hsr/hsr_main.h | 4 ++++ 3 files changed, 20 insertions(+) diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index a7462a718e7b..d5f709b940ff 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -65,6 +65,19 @@ hsr_node_table_open(struct inode *inode, struct file *filp) return single_open(filp, hsr_node_table_show, inode->i_private); } +void hsr_debugfs_rename(struct net_device *dev) +{ + struct hsr_priv *priv = netdev_priv(dev); + struct dentry *d; + + d = debugfs_rename(hsr_debugfs_root_dir, priv->node_tbl_root, + hsr_debugfs_root_dir, dev->name); + if (IS_ERR(d)) + netdev_warn(dev, "failed to rename\n"); + else + priv->node_tbl_root = d; +} + static const struct file_operations hsr_fops = { .open = hsr_node_table_open, .read = seq_read, diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 490896379073..ea23eb7408e4 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -45,6 +45,9 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, case NETDEV_CHANGE: /* Link (carrier) state changes */ hsr_check_carrier_and_operstate(hsr); break; + case NETDEV_CHANGENAME: + hsr_debugfs_rename(dev); + break; case NETDEV_CHANGEADDR: if (port->type == HSR_PT_MASTER) { /* This should not happen since there's no diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 55d2057bf749..8d885bc6a54d 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -184,11 +184,15 @@ static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb) } #if IS_ENABLED(CONFIG_DEBUG_FS) +void hsr_debugfs_rename(struct net_device *dev); void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev); void hsr_debugfs_term(struct hsr_priv *priv); void hsr_debugfs_create_root(void); void hsr_debugfs_remove_root(void); #else +static inline void void hsr_debugfs_rename(struct net_device *dev) +{ +} static inline void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) {} From 92a35678ec075100ce666a2fb6969151affb0e5d Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 22 Dec 2019 11:26:54 +0000 Subject: [PATCH 062/146] hsr: fix a race condition in node list insertion and deletion hsr nodes are protected by RCU and there is no write side lock. But node insertions and deletions could be being operated concurrently. So write side locking is needed. Test commands: ip netns add nst ip link add veth0 type veth peer name veth1 ip link add veth2 type veth peer name veth3 ip link set veth1 netns nst ip link set veth3 netns nst ip link set veth0 up ip link set veth2 up ip link add hsr0 type hsr slave1 veth0 slave2 veth2 ip a a 192.168.100.1/24 dev hsr0 ip link set hsr0 up ip netns exec nst ip link set veth1 up ip netns exec nst ip link set veth3 up ip netns exec nst ip link add hsr1 type hsr slave1 veth1 slave2 veth3 ip netns exec nst ip a a 192.168.100.2/24 dev hsr1 ip netns exec nst ip link set hsr1 up for i in {0..9} do for j in {0..9} do for k in {0..9} do for l in {0..9} do arping 192.168.100.2 -I hsr0 -s 00:01:3$i:4$j:5$k:6$l -c1 & done done done done Splat looks like: [ 236.066091][ T3286] list_add corruption. next->prev should be prev (ffff8880a5940300), but was ffff8880a5940d0. [ 236.069617][ T3286] ------------[ cut here ]------------ [ 236.070545][ T3286] kernel BUG at lib/list_debug.c:25! [ 236.071391][ T3286] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI [ 236.072343][ T3286] CPU: 0 PID: 3286 Comm: arping Tainted: G W 5.5.0-rc1+ #209 [ 236.073463][ T3286] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 236.074695][ T3286] RIP: 0010:__list_add_valid+0x74/0xd0 [ 236.075499][ T3286] Code: 48 39 da 75 27 48 39 f5 74 36 48 39 dd 74 31 48 83 c4 08 b8 01 00 00 00 5b 5d c3 48 b [ 236.078277][ T3286] RSP: 0018:ffff8880aaa97648 EFLAGS: 00010286 [ 236.086991][ T3286] RAX: 0000000000000075 RBX: ffff8880d4624c20 RCX: 0000000000000000 [ 236.088000][ T3286] RDX: 0000000000000075 RSI: 0000000000000008 RDI: ffffed1015552ebf [ 236.098897][ T3286] RBP: ffff88809b53d200 R08: ffffed101b3c04f9 R09: ffffed101b3c04f9 [ 236.099960][ T3286] R10: 00000000308769a1 R11: ffffed101b3c04f8 R12: ffff8880d4624c28 [ 236.100974][ T3286] R13: ffff8880d4624c20 R14: 0000000040310100 R15: ffff8880ce17ee02 [ 236.138967][ T3286] FS: 00007f23479fa680(0000) GS:ffff8880d9c00000(0000) knlGS:0000000000000000 [ 236.144852][ T3286] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 236.145720][ T3286] CR2: 00007f4a14bab210 CR3: 00000000a61c6001 CR4: 00000000000606f0 [ 236.146776][ T3286] Call Trace: [ 236.147222][ T3286] hsr_add_node+0x314/0x490 [hsr] [ 236.153633][ T3286] hsr_forward_skb+0x2b6/0x1bc0 [hsr] [ 236.154362][ T3286] ? rcu_read_lock_sched_held+0x90/0xc0 [ 236.155091][ T3286] ? rcu_read_lock_bh_held+0xa0/0xa0 [ 236.156607][ T3286] hsr_dev_xmit+0x70/0xd0 [hsr] [ 236.157254][ T3286] dev_hard_start_xmit+0x160/0x740 [ 236.157941][ T3286] __dev_queue_xmit+0x1961/0x2e10 [ 236.158565][ T3286] ? netdev_core_pick_tx+0x2e0/0x2e0 [ ... ] Reported-by: syzbot+3924327f9ad5f4d2b343@syzkaller.appspotmail.com Fixes: f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_device.c | 7 ++-- net/hsr/hsr_framereg.c | 73 ++++++++++++++++++++++++++---------------- net/hsr/hsr_framereg.h | 6 ++-- net/hsr/hsr_main.c | 2 +- net/hsr/hsr_main.h | 5 +-- 5 files changed, 56 insertions(+), 37 deletions(-) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index e73549075a03..62c03f0d0079 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -368,7 +368,7 @@ static void hsr_dev_destroy(struct net_device *hsr_dev) del_timer_sync(&hsr->prune_timer); del_timer_sync(&hsr->announce_timer); - hsr_del_self_node(&hsr->self_node_db); + hsr_del_self_node(hsr); hsr_del_nodes(&hsr->node_db); } @@ -440,11 +440,12 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], INIT_LIST_HEAD(&hsr->ports); INIT_LIST_HEAD(&hsr->node_db); INIT_LIST_HEAD(&hsr->self_node_db); + spin_lock_init(&hsr->list_lock); ether_addr_copy(hsr_dev->dev_addr, slave[0]->dev_addr); /* Make sure we recognize frames from ourselves in hsr_rcv() */ - res = hsr_create_self_node(&hsr->self_node_db, hsr_dev->dev_addr, + res = hsr_create_self_node(hsr, hsr_dev->dev_addr, slave[1]->dev_addr); if (res < 0) return res; @@ -502,7 +503,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], list_for_each_entry_safe(port, tmp, &hsr->ports, port_list) hsr_del_port(port); err_add_master: - hsr_del_self_node(&hsr->self_node_db); + hsr_del_self_node(hsr); return res; } diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 292be446007b..27dc65d7de67 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -75,10 +75,11 @@ static struct hsr_node *find_node_by_addr_A(struct list_head *node_db, /* Helper for device init; the self_node_db is used in hsr_rcv() to recognize * frames from self that's been looped over the HSR ring. */ -int hsr_create_self_node(struct list_head *self_node_db, +int hsr_create_self_node(struct hsr_priv *hsr, unsigned char addr_a[ETH_ALEN], unsigned char addr_b[ETH_ALEN]) { + struct list_head *self_node_db = &hsr->self_node_db; struct hsr_node *node, *oldnode; node = kmalloc(sizeof(*node), GFP_KERNEL); @@ -88,33 +89,33 @@ int hsr_create_self_node(struct list_head *self_node_db, ether_addr_copy(node->macaddress_A, addr_a); ether_addr_copy(node->macaddress_B, addr_b); - rcu_read_lock(); + spin_lock_bh(&hsr->list_lock); oldnode = list_first_or_null_rcu(self_node_db, struct hsr_node, mac_list); if (oldnode) { list_replace_rcu(&oldnode->mac_list, &node->mac_list); - rcu_read_unlock(); - synchronize_rcu(); - kfree(oldnode); + spin_unlock_bh(&hsr->list_lock); + kfree_rcu(oldnode, rcu_head); } else { - rcu_read_unlock(); list_add_tail_rcu(&node->mac_list, self_node_db); + spin_unlock_bh(&hsr->list_lock); } return 0; } -void hsr_del_self_node(struct list_head *self_node_db) +void hsr_del_self_node(struct hsr_priv *hsr) { + struct list_head *self_node_db = &hsr->self_node_db; struct hsr_node *node; - rcu_read_lock(); + spin_lock_bh(&hsr->list_lock); node = list_first_or_null_rcu(self_node_db, struct hsr_node, mac_list); - rcu_read_unlock(); if (node) { list_del_rcu(&node->mac_list); - kfree(node); + kfree_rcu(node, rcu_head); } + spin_unlock_bh(&hsr->list_lock); } void hsr_del_nodes(struct list_head *node_db) @@ -130,30 +131,43 @@ void hsr_del_nodes(struct list_head *node_db) * seq_out is used to initialize filtering of outgoing duplicate frames * originating from the newly added node. */ -struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[], - u16 seq_out) +static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, + struct list_head *node_db, + unsigned char addr[], + u16 seq_out) { - struct hsr_node *node; + struct hsr_node *new_node, *node; unsigned long now; int i; - node = kzalloc(sizeof(*node), GFP_ATOMIC); - if (!node) + new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC); + if (!new_node) return NULL; - ether_addr_copy(node->macaddress_A, addr); + ether_addr_copy(new_node->macaddress_A, addr); /* We are only interested in time diffs here, so use current jiffies * as initialization. (0 could trigger an spurious ring error warning). */ now = jiffies; for (i = 0; i < HSR_PT_PORTS; i++) - node->time_in[i] = now; + new_node->time_in[i] = now; for (i = 0; i < HSR_PT_PORTS; i++) - node->seq_out[i] = seq_out; - - list_add_tail_rcu(&node->mac_list, node_db); + new_node->seq_out[i] = seq_out; + spin_lock_bh(&hsr->list_lock); + list_for_each_entry_rcu(node, node_db, mac_list) { + if (ether_addr_equal(node->macaddress_A, addr)) + goto out; + if (ether_addr_equal(node->macaddress_B, addr)) + goto out; + } + list_add_tail_rcu(&new_node->mac_list, node_db); + spin_unlock_bh(&hsr->list_lock); + return new_node; +out: + spin_unlock_bh(&hsr->list_lock); + kfree(new_node); return node; } @@ -163,6 +177,7 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, bool is_sup) { struct list_head *node_db = &port->hsr->node_db; + struct hsr_priv *hsr = port->hsr; struct hsr_node *node; struct ethhdr *ethhdr; u16 seq_out; @@ -196,7 +211,7 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, seq_out = HSR_SEQNR_START; } - return hsr_add_node(node_db, ethhdr->h_source, seq_out); + return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out); } /* Use the Supervision frame's info about an eventual macaddress_B for merging @@ -206,10 +221,11 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, struct hsr_port *port_rcv) { - struct ethhdr *ethhdr; - struct hsr_node *node_real; + struct hsr_priv *hsr = port_rcv->hsr; struct hsr_sup_payload *hsr_sp; + struct hsr_node *node_real; struct list_head *node_db; + struct ethhdr *ethhdr; int i; ethhdr = (struct ethhdr *)skb_mac_header(skb); @@ -231,7 +247,7 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, node_real = find_node_by_addr_A(node_db, hsr_sp->macaddress_A); if (!node_real) /* No frame received from AddrA of this node yet */ - node_real = hsr_add_node(node_db, hsr_sp->macaddress_A, + node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A, HSR_SEQNR_START - 1); if (!node_real) goto done; /* No mem */ @@ -252,7 +268,9 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, } node_real->addr_B_port = port_rcv->type; + spin_lock_bh(&hsr->list_lock); list_del_rcu(&node_curr->mac_list); + spin_unlock_bh(&hsr->list_lock); kfree_rcu(node_curr, rcu_head); done: @@ -368,12 +386,13 @@ void hsr_prune_nodes(struct timer_list *t) { struct hsr_priv *hsr = from_timer(hsr, t, prune_timer); struct hsr_node *node; + struct hsr_node *tmp; struct hsr_port *port; unsigned long timestamp; unsigned long time_a, time_b; - rcu_read_lock(); - list_for_each_entry_rcu(node, &hsr->node_db, mac_list) { + spin_lock_bh(&hsr->list_lock); + list_for_each_entry_safe(node, tmp, &hsr->node_db, mac_list) { /* Don't prune own node. Neither time_in[HSR_PT_SLAVE_A] * nor time_in[HSR_PT_SLAVE_B], will ever be updated for * the master port. Thus the master node will be repeatedly @@ -421,7 +440,7 @@ void hsr_prune_nodes(struct timer_list *t) kfree_rcu(node, rcu_head); } } - rcu_read_unlock(); + spin_unlock_bh(&hsr->list_lock); /* Restart timer */ mod_timer(&hsr->prune_timer, diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index 89a3ce38151d..0f0fa12b4329 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -12,10 +12,8 @@ struct hsr_node; -void hsr_del_self_node(struct list_head *self_node_db); +void hsr_del_self_node(struct hsr_priv *hsr); void hsr_del_nodes(struct list_head *node_db); -struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[], - u16 seq_out); struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, bool is_sup); void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, @@ -33,7 +31,7 @@ int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, void hsr_prune_nodes(struct timer_list *t); -int hsr_create_self_node(struct list_head *self_node_db, +int hsr_create_self_node(struct hsr_priv *hsr, unsigned char addr_a[ETH_ALEN], unsigned char addr_b[ETH_ALEN]); diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index ea23eb7408e4..d2ee7125a7f1 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -67,7 +67,7 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, /* Make sure we recognize frames from ourselves in hsr_rcv() */ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); - res = hsr_create_self_node(&hsr->self_node_db, + res = hsr_create_self_node(hsr, master->dev->dev_addr, port ? port->dev->dev_addr : diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 8d885bc6a54d..d40de84a637f 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -160,8 +160,9 @@ struct hsr_priv { int announce_count; u16 sequence_nr; u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ - u8 prot_version; /* Indicate if HSRv0 or HSRv1. */ - spinlock_t seqnr_lock; /* locking for sequence_nr */ + u8 prot_version; /* Indicate if HSRv0 or HSRv1. */ + spinlock_t seqnr_lock; /* locking for sequence_nr */ + spinlock_t list_lock; /* locking for node list */ unsigned char sup_multicast_addr[ETH_ALEN]; #ifdef CONFIG_DEBUG_FS struct dentry *node_tbl_root; From 3ed0a1d563903bdb4b4c36c58c4d9c1bcb23a6e6 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 22 Dec 2019 11:27:08 +0000 Subject: [PATCH 063/146] hsr: reset network header when supervision frame is created The supervision frame is L2 frame. When supervision frame is created, hsr module doesn't set network header. If tap routine is enabled, dev_queue_xmit_nit() is called and it checks network_header. If network_header pointer wasn't set(or invalid), it resets network_header and warns. In order to avoid unnecessary warning message, resetting network_header is needed. Test commands: ip netns add nst ip link add veth0 type veth peer name veth1 ip link add veth2 type veth peer name veth3 ip link set veth1 netns nst ip link set veth3 netns nst ip link set veth0 up ip link set veth2 up ip link add hsr0 type hsr slave1 veth0 slave2 veth2 ip a a 192.168.100.1/24 dev hsr0 ip link set hsr0 up ip netns exec nst ip link set veth1 up ip netns exec nst ip link set veth3 up ip netns exec nst ip link add hsr1 type hsr slave1 veth1 slave2 veth3 ip netns exec nst ip a a 192.168.100.2/24 dev hsr1 ip netns exec nst ip link set hsr1 up tcpdump -nei veth0 Splat looks like: [ 175.852292][ C3] protocol 88fb is buggy, dev veth0 Fixes: f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 62c03f0d0079..c7bd6c49fadf 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -272,6 +272,8 @@ static void send_hsr_supervision_frame(struct hsr_port *master, skb->dev->dev_addr, skb->len) <= 0) goto out; skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); if (hsr_ver > 0) { hsr_tag = skb_put(skb, sizeof(struct hsr_tag)); From a5bcd72e054aabb93ddc51ed8cde36a5bfc50271 Mon Sep 17 00:00:00 2001 From: Vladyslav Tarasiuk Date: Thu, 26 Dec 2019 10:41:56 +0200 Subject: [PATCH 064/146] net/mlxfw: Fix out-of-memory error in mfa2 flash burning The burning process requires to perform internal allocations of large chunks of memory. This memory doesn't need to be contiguous and can be safely allocated by vzalloc() instead of kzalloc(). This patch changes such allocation to avoid possible out-of-memory failure. Fixes: 410ed13cae39 ("Add the mlxfw module for Mellanox firmware flash process") Signed-off-by: Vladyslav Tarasiuk Reviewed-by: Aya Levin Signed-off-by: Leon Romanovsky Tested-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c index 544344ac4894..79057af4fe99 100644 --- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "mlxfw_mfa2.h" #include "mlxfw_mfa2_file.h" @@ -548,7 +549,7 @@ mlxfw_mfa2_file_component_get(const struct mlxfw_mfa2_file *mfa2_file, comp_size = be32_to_cpu(comp->size); comp_buf_size = comp_size + mlxfw_mfa2_comp_magic_len; - comp_data = kmalloc(sizeof(*comp_data) + comp_buf_size, GFP_KERNEL); + comp_data = vzalloc(sizeof(*comp_data) + comp_buf_size); if (!comp_data) return ERR_PTR(-ENOMEM); comp_data->comp.data_size = comp_size; @@ -570,7 +571,7 @@ mlxfw_mfa2_file_component_get(const struct mlxfw_mfa2_file *mfa2_file, comp_data->comp.data = comp_data->buff + mlxfw_mfa2_comp_magic_len; return &comp_data->comp; err_out: - kfree(comp_data); + vfree(comp_data); return ERR_PTR(err); } @@ -579,7 +580,7 @@ void mlxfw_mfa2_file_component_put(struct mlxfw_mfa2_component *comp) const struct mlxfw_mfa2_comp_data *comp_data; comp_data = container_of(comp, struct mlxfw_mfa2_comp_data, comp); - kfree(comp_data); + vfree(comp_data); } void mlxfw_mfa2_file_fini(struct mlxfw_mfa2_file *mfa2_file) From c27569fcd6e1b11bd24361346504f2995a256e4e Mon Sep 17 00:00:00 2001 From: Madalin Bucur Date: Mon, 23 Dec 2019 09:39:22 +0200 Subject: [PATCH 065/146] dpaa_eth: fix DMA mapping leak On the error path some fragments remain DMA mapped. Adding a fix that unmaps all the fragments. Rework cleanup path to be simpler. Fixes: 8151ee88bad5 ("dpaa_eth: use page backed rx buffers") Signed-off-by: Madalin Bucur Signed-off-by: David S. Miller --- .../net/ethernet/freescale/dpaa/dpaa_eth.c | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 6a9d12dad5d9..a301f0095223 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -1719,7 +1719,7 @@ static struct sk_buff *sg_fd_to_skb(const struct dpaa_priv *priv, int page_offset; unsigned int sz; int *count_ptr; - int i; + int i, j; vaddr = phys_to_virt(addr); WARN_ON(!IS_ALIGNED((unsigned long)vaddr, SMP_CACHE_BYTES)); @@ -1736,14 +1736,14 @@ static struct sk_buff *sg_fd_to_skb(const struct dpaa_priv *priv, WARN_ON(!IS_ALIGNED((unsigned long)sg_vaddr, SMP_CACHE_BYTES)); + dma_unmap_page(priv->rx_dma_dev, sg_addr, + DPAA_BP_RAW_SIZE, DMA_FROM_DEVICE); + /* We may use multiple Rx pools */ dpaa_bp = dpaa_bpid2pool(sgt[i].bpid); if (!dpaa_bp) goto free_buffers; - count_ptr = this_cpu_ptr(dpaa_bp->percpu_count); - dma_unmap_page(priv->rx_dma_dev, sg_addr, - DPAA_BP_RAW_SIZE, DMA_FROM_DEVICE); if (!skb) { sz = dpaa_bp->size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); @@ -1786,7 +1786,9 @@ static struct sk_buff *sg_fd_to_skb(const struct dpaa_priv *priv, skb_add_rx_frag(skb, i - 1, head_page, frag_off, frag_len, dpaa_bp->size); } + /* Update the pool count for the current {cpu x bpool} */ + count_ptr = this_cpu_ptr(dpaa_bp->percpu_count); (*count_ptr)--; if (qm_sg_entry_is_final(&sgt[i])) @@ -1800,26 +1802,25 @@ static struct sk_buff *sg_fd_to_skb(const struct dpaa_priv *priv, return skb; free_buffers: - /* compensate sw bpool counter changes */ - for (i--; i >= 0; i--) { - dpaa_bp = dpaa_bpid2pool(sgt[i].bpid); - if (dpaa_bp) { - count_ptr = this_cpu_ptr(dpaa_bp->percpu_count); - (*count_ptr)++; - } - } /* free all the SG entries */ - for (i = 0; i < DPAA_SGT_MAX_ENTRIES ; i++) { - sg_addr = qm_sg_addr(&sgt[i]); + for (j = 0; j < DPAA_SGT_MAX_ENTRIES ; j++) { + sg_addr = qm_sg_addr(&sgt[j]); sg_vaddr = phys_to_virt(sg_addr); + /* all pages 0..i were unmaped */ + if (j > i) + dma_unmap_page(priv->rx_dma_dev, qm_sg_addr(&sgt[j]), + DPAA_BP_RAW_SIZE, DMA_FROM_DEVICE); free_pages((unsigned long)sg_vaddr, 0); - dpaa_bp = dpaa_bpid2pool(sgt[i].bpid); - if (dpaa_bp) { - count_ptr = this_cpu_ptr(dpaa_bp->percpu_count); - (*count_ptr)--; + /* counters 0..i-1 were decremented */ + if (j >= i) { + dpaa_bp = dpaa_bpid2pool(sgt[j].bpid); + if (dpaa_bp) { + count_ptr = this_cpu_ptr(dpaa_bp->percpu_count); + (*count_ptr)--; + } } - if (qm_sg_entry_is_final(&sgt[i])) + if (qm_sg_entry_is_final(&sgt[j])) break; } /* free the SGT fragment */ From 1c93fb45761e79b3c00080e71523886cefaf351c Mon Sep 17 00:00:00 2001 From: Madalin Bucur Date: Mon, 23 Dec 2019 10:06:10 +0200 Subject: [PATCH 066/146] net: phy: aquantia: add suspend / resume ops for AQR105 The suspend/resume code for AQR107 works on AQR105 too. This patch fixes issues with the partner not seeing the link down when the interface using AQR105 is brought down. Fixes: bee8259dd31f ("net: phy: add driver for aquantia phy") Signed-off-by: Madalin Bucur Signed-off-by: David S. Miller --- drivers/net/phy/aquantia_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/phy/aquantia_main.c b/drivers/net/phy/aquantia_main.c index 3b29d381116f..975789d9349d 100644 --- a/drivers/net/phy/aquantia_main.c +++ b/drivers/net/phy/aquantia_main.c @@ -627,6 +627,8 @@ static struct phy_driver aqr_driver[] = { .config_intr = aqr_config_intr, .ack_interrupt = aqr_ack_interrupt, .read_status = aqr_read_status, + .suspend = aqr107_suspend, + .resume = aqr107_resume, }, { PHY_ID_MATCH_MODEL(PHY_ID_AQR106), From 7df2281a174bd0fdbb2211a26914e5440740fcde Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 23 Dec 2019 11:03:21 +0100 Subject: [PATCH 067/146] of: mdio: Add missing inline to of_mdiobus_child_is_phy() dummy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_OF_MDIO=n: drivers/net/phy/mdio_bus.c:23: include/linux/of_mdio.h:58:13: warning: ‘of_mdiobus_child_is_phy’ defined but not used [-Wunused-function] static bool of_mdiobus_child_is_phy(struct device_node *child) ^~~~~~~~~~~~~~~~~~~~~~~ Fix this by adding the missing "inline" keyword. Fixes: 0aa4d016c043d16a ("of: mdio: export of_mdiobus_child_is_phy") Signed-off-by: Geert Uytterhoeven Reviewed-by: Andrew Lunn Acked-by: Borislav Petkov Signed-off-by: David S. Miller --- include/linux/of_mdio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index 79bc82e30c02..491a2b7e77c1 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -55,7 +55,7 @@ static inline int of_mdio_parse_addr(struct device *dev, } #else /* CONFIG_OF_MDIO */ -static bool of_mdiobus_child_is_phy(struct device_node *child) +static inline bool of_mdiobus_child_is_phy(struct device_node *child) { return false; } From 0444716a5dd563526e53ae686115987d5d4c249e Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Mon, 23 Dec 2019 10:23:08 -0800 Subject: [PATCH 068/146] bnx2x: Use appropriate define for vlan credit Although it has same value as MAX_MAC_CREDIT_E2, use MAX_VLAN_CREDIT_E2 appropriately. Signed-off-by: Manish Chopra Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h index 7a6e82db4231..ed237854939a 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h @@ -1537,7 +1537,7 @@ void bnx2x_get_rss_ind_table(struct bnx2x_rss_config_obj *rss_obj, func_num + GET_NUM_VFS_PER_PF(bp) * VF_MAC_CREDIT_CNT) #define PF_VLAN_CREDIT_E2(bp, func_num) \ - ((MAX_MAC_CREDIT_E2 - GET_NUM_VFS_PER_PATH(bp) * VF_VLAN_CREDIT_CNT) / \ + ((MAX_VLAN_CREDIT_E2 - GET_NUM_VFS_PER_PATH(bp) * VF_VLAN_CREDIT_CNT) /\ func_num + GET_NUM_VFS_PER_PF(bp) * VF_VLAN_CREDIT_CNT) #endif /* BNX2X_SP_VERBS */ From 5cdc40c7820ff66c2271e0884bd8ee8f7cfd769b Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Mon, 23 Dec 2019 10:23:09 -0800 Subject: [PATCH 069/146] bnx2x: Fix accounting of vlan resources among the PFs While testing max vlan configuration on the PF, firmware gets assert as driver was configuring number of vlans more than what is supported per port/engine, it was figured out that there is an implicit vlan (hidden default vlan consuming hardware cam entry resource) which is configured default for all the clients (PF/VFs) on client_init ramrod by the adapter implicitly, so when allocating resources among the PFs this implicit vlan should be considered or total vlan entries should be reduced by one to accommodate that default/implicit vlan entry. Signed-off-by: Manish Chopra Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h index ed237854939a..bacc8552bce1 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h @@ -1536,8 +1536,11 @@ void bnx2x_get_rss_ind_table(struct bnx2x_rss_config_obj *rss_obj, ((MAX_MAC_CREDIT_E2 - GET_NUM_VFS_PER_PATH(bp) * VF_MAC_CREDIT_CNT) / \ func_num + GET_NUM_VFS_PER_PF(bp) * VF_MAC_CREDIT_CNT) +#define BNX2X_VFS_VLAN_CREDIT(bp) \ + (GET_NUM_VFS_PER_PATH(bp) * VF_VLAN_CREDIT_CNT) + #define PF_VLAN_CREDIT_E2(bp, func_num) \ - ((MAX_VLAN_CREDIT_E2 - GET_NUM_VFS_PER_PATH(bp) * VF_VLAN_CREDIT_CNT) /\ + ((MAX_VLAN_CREDIT_E2 - 1 - BNX2X_VFS_VLAN_CREDIT(bp)) / \ func_num + GET_NUM_VFS_PER_PF(bp) * VF_VLAN_CREDIT_CNT) #endif /* BNX2X_SP_VERBS */ From bb3d0b8bf5be61ab1d6f472c43cbf34de17e796b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Dec 2019 11:13:24 -0800 Subject: [PATCH 070/146] net_sched: sch_fq: properly set sk->sk_pacing_status If fq_classify() recycles a struct fq_flow because a socket structure has been reallocated, we do not set sk->sk_pacing_status immediately, but later if the flow becomes detached. This means that any flow requiring pacing (BBR, or SO_MAX_PACING_RATE) might fallback to TCP internal pacing, which requires a per-socket high resolution timer, and therefore more cpu cycles. Fixes: 218af599fa63 ("tcp: internal implementation for pacing") Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index b1c7e726ce5d..ff4c5e9d0d77 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -301,6 +301,9 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) f->socket_hash != sk->sk_hash)) { f->credit = q->initial_quantum; f->socket_hash = sk->sk_hash; + if (q->rate_enable) + smp_store_release(&sk->sk_pacing_status, + SK_PACING_FQ); if (fq_flow_is_throttled(f)) fq_flow_unset_throttled(q, f); f->time_next_packet = 0ULL; @@ -322,8 +325,12 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) fq_flow_set_detached(f); f->sk = sk; - if (skb->sk == sk) + if (skb->sk == sk) { f->socket_hash = sk->sk_hash; + if (q->rate_enable) + smp_store_release(&sk->sk_pacing_status, + SK_PACING_FQ); + } f->credit = q->initial_quantum; rb_link_node(&f->fq_node, parent, p); @@ -428,17 +435,9 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, f->qlen++; qdisc_qstats_backlog_inc(sch, skb); if (fq_flow_is_detached(f)) { - struct sock *sk = skb->sk; - fq_flow_add_tail(&q->new_flows, f); if (time_after(jiffies, f->age + q->flow_refill_delay)) f->credit = max_t(u32, f->credit, q->quantum); - if (sk && q->rate_enable) { - if (unlikely(smp_load_acquire(&sk->sk_pacing_status) != - SK_PACING_FQ)) - smp_store_release(&sk->sk_pacing_status, - SK_PACING_FQ); - } q->inactive_flows--; } From b0b5ce1010ffc50015eaec72b0028aaae3f526bb Mon Sep 17 00:00:00 2001 From: "Alexander.Barabash@dell.com" Date: Wed, 25 Dec 2019 17:55:30 +0000 Subject: [PATCH 071/146] ioat: ioat_alloc_ring() failure handling. If dma_alloc_coherent() returns NULL in ioat_alloc_ring(), ring allocation must not proceed. Until now, if the first call to dma_alloc_coherent() in ioat_alloc_ring() returned NULL, the processing could proceed, failing with NULL-pointer dereferencing further down the line. Signed-off-by: Alexander Barabash Acked-by: Dave Jiang Link: https://lore.kernel.org/r/75e9c0e84c3345d693c606c64f8b9ab5@x13pwhopdag1307.AMER.DELL.COM Signed-off-by: Vinod Koul --- drivers/dma/ioat/dma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c index 1a422a8b43cf..18c011e57592 100644 --- a/drivers/dma/ioat/dma.c +++ b/drivers/dma/ioat/dma.c @@ -377,10 +377,11 @@ ioat_alloc_ring(struct dma_chan *c, int order, gfp_t flags) descs->virt = dma_alloc_coherent(to_dev(ioat_chan), SZ_2M, &descs->hw, flags); - if (!descs->virt && (i > 0)) { + if (!descs->virt) { int idx; for (idx = 0; idx < i; idx++) { + descs = &ioat_chan->descs[idx]; dma_free_coherent(to_dev(ioat_chan), SZ_2M, descs->virt, descs->hw); descs->virt = NULL; From e79c22695abd3b75a6aecf4ea4b9607e8d82c49c Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Thu, 19 Dec 2019 14:12:15 +0800 Subject: [PATCH 072/146] ALSA: hda/realtek - Add Bass Speaker and fixed dac for bass speaker Dell has new platform which has dual speaker connecting. They want dual speaker which use same dac for output. Signed-off-by: Kailang Yang Cc: Link: https://lore.kernel.org/r/229c7efa2b474a16b7d8a916cd096b68@realtek.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 5bc1a6d24333..2ee703c2da78 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5908,6 +5908,8 @@ enum { ALC294_FIXUP_ASUS_INTSPK_HEADSET_MIC, ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE, ALC294_FIXUP_ASUS_INTSPK_GPIO, + ALC289_FIXUP_DELL_SPK2, + ALC289_FIXUP_DUAL_SPK, }; static const struct hda_fixup alc269_fixups[] = { @@ -7009,6 +7011,21 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC294_FIXUP_ASUS_INTSPK_HEADSET_MIC }, + [ALC289_FIXUP_DELL_SPK2] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x17, 0x90170130 }, /* bass spk */ + { } + }, + .chained = true, + .chain_id = ALC269_FIXUP_DELL4_MIC_NO_PRESENCE + }, + [ALC289_FIXUP_DUAL_SPK] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc285_fixup_speaker2_to_dac1, + .chained = true, + .chain_id = ALC289_FIXUP_DELL_SPK2 + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -7081,6 +7098,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x08ad, "Dell WYSE AIO", ALC225_FIXUP_DELL_WYSE_AIO_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x08ae, "Dell WYSE NB", ALC225_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x0935, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB), + SND_PCI_QUIRK(0x1028, 0x097e, "Dell Precision", ALC289_FIXUP_DUAL_SPK), + SND_PCI_QUIRK(0x1028, 0x097d, "Dell Precision", ALC289_FIXUP_DUAL_SPK), SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2), From 70cf3dc7313207816255b9acb0dffb19dae78144 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Wed, 25 Dec 2019 10:51:01 +0200 Subject: [PATCH 073/146] net/sched: act_mirred: Pull mac prior redir to non mac_header_xmit device There's no skb_pull performed when a mirred action is set at egress of a mac device, with a target device/action that expects skb->data to point at the network header. As a result, either the target device is errornously given an skb with data pointing to the mac (egress case), or the net stack receives the skb with data pointing to the mac (ingress case). E.g: # tc qdisc add dev eth9 root handle 1: prio # tc filter add dev eth9 parent 1: prio 9 protocol ip handle 9 basic \ action mirred egress redirect dev tun0 (tun0 is a tun device. result: tun0 errornously gets the eth header instead of the iph) Revise the push/pull logic of tcf_mirred_act() to not rely on the skb_at_tc_ingress() vs tcf_mirred_act_wants_ingress() comparison, as it does not cover all "pull" cases. Instead, calculate whether the required action on the target device requires the data to point at the network header, and compare this to whether skb->data points to network header - and make the push/pull adjustments as necessary. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Shmulik Ladkani Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1e3eb3a97532..1ad300e6dbc0 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -219,8 +219,10 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, bool use_reinsert; bool want_ingress; bool is_redirect; + bool expects_nh; int m_eaction; int mac_len; + bool at_nh; rec_level = __this_cpu_inc_return(mirred_rec_level); if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) { @@ -261,19 +263,19 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, goto out; } - /* If action's target direction differs than filter's direction, - * and devices expect a mac header on xmit, then mac push/pull is - * needed. - */ want_ingress = tcf_mirred_act_wants_ingress(m_eaction); - if (skb_at_tc_ingress(skb) != want_ingress && m_mac_header_xmit) { - if (!skb_at_tc_ingress(skb)) { - /* caught at egress, act ingress: pull mac */ - mac_len = skb_network_header(skb) - skb_mac_header(skb); + + expects_nh = want_ingress || !m_mac_header_xmit; + at_nh = skb->data == skb_network_header(skb); + if (at_nh != expects_nh) { + mac_len = skb_at_tc_ingress(skb) ? skb->mac_len : + skb_network_header(skb) - skb_mac_header(skb); + if (expects_nh) { + /* target device/action expect data at nh */ skb_pull_rcsum(skb2, mac_len); } else { - /* caught at ingress, act egress: push mac */ - skb_push_rcsum(skb2, skb->mac_len); + /* target device/action expect data at mac */ + skb_push_rcsum(skb2, mac_len); } } From bd6f48546b9cb7a785344fc78058c420923d7ed8 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Thu, 26 Dec 2019 20:01:01 +0100 Subject: [PATCH 074/146] net: stmmac: dwmac-meson8b: Fix the RGMII TX delay on Meson8b/8m2 SoCs GXBB and newer SoCs use the fixed FCLK_DIV2 (1GHz) clock as input for the m250_sel clock. Meson8b and Meson8m2 use MPLL2 instead, whose rate can be adjusted at runtime. So far we have been running MPLL2 with ~250MHz (and the internal m250_div with value 1), which worked enough that we could transfer data with an TX delay of 4ns. Unfortunately there is high packet loss with an RGMII PHY when transferring data (receiving data works fine though). Odroid-C1's u-boot is running with a TX delay of only 2ns as well as the internal m250_div set to 2 - no lost (TX) packets can be observed with that setting in u-boot. Manual testing has shown that the TX packet loss goes away when using the following settings in Linux (the vendor kernel uses the same settings): - MPLL2 clock set to ~500MHz - m250_div set to 2 - TX delay set to 2ns on the MAC side Update the m250_div divider settings to only accept dividers greater or equal 2 to fix the TX delay generated by the MAC. iperf3 results before the change: [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-10.00 sec 182 MBytes 153 Mbits/sec 514 sender [ 5] 0.00-10.00 sec 182 MBytes 152 Mbits/sec receiver iperf3 results after the change (including an updated TX delay of 2ns): [ ID] Interval Transfer Bitrate Retr Cwnd [ 5] 0.00-10.00 sec 927 MBytes 778 Mbits/sec 0 sender [ 5] 0.00-10.01 sec 927 MBytes 777 Mbits/sec receiver Fixes: 4f6a71b84e1afd ("net: stmmac: dwmac-meson8b: fix internal RGMII clock configuration") Signed-off-by: Martin Blumenstingl Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c index bd6c01004913..0e2fa14f1423 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c @@ -112,6 +112,14 @@ static int meson8b_init_rgmii_tx_clk(struct meson8b_dwmac *dwmac) struct device *dev = dwmac->dev; const char *parent_name, *mux_parent_names[MUX_CLK_NUM_PARENTS]; struct meson8b_dwmac_clk_configs *clk_configs; + static const struct clk_div_table div_table[] = { + { .div = 2, .val = 2, }, + { .div = 3, .val = 3, }, + { .div = 4, .val = 4, }, + { .div = 5, .val = 5, }, + { .div = 6, .val = 6, }, + { .div = 7, .val = 7, }, + }; clk_configs = devm_kzalloc(dev, sizeof(*clk_configs), GFP_KERNEL); if (!clk_configs) @@ -146,9 +154,9 @@ static int meson8b_init_rgmii_tx_clk(struct meson8b_dwmac *dwmac) clk_configs->m250_div.reg = dwmac->regs + PRG_ETH0; clk_configs->m250_div.shift = PRG_ETH0_CLK_M250_DIV_SHIFT; clk_configs->m250_div.width = PRG_ETH0_CLK_M250_DIV_WIDTH; - clk_configs->m250_div.flags = CLK_DIVIDER_ONE_BASED | - CLK_DIVIDER_ALLOW_ZERO | - CLK_DIVIDER_ROUND_CLOSEST; + clk_configs->m250_div.table = div_table; + clk_configs->m250_div.flags = CLK_DIVIDER_ALLOW_ZERO | + CLK_DIVIDER_ROUND_CLOSEST; clk = meson8b_dwmac_register_clk(dwmac, "m250_div", &parent_name, 1, &clk_divider_ops, &clk_configs->m250_div.hw); From 85a8ce62c2eabe28b9d76ca4eecf37922402df93 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 28 Dec 2019 07:05:48 +0800 Subject: [PATCH 075/146] block: add bio_truncate to fix guard_bio_eod Some filesystem, such as vfat, may send bio which crosses device boundary, and the worse thing is that the IO request starting within device boundaries can contain more than one segment past EOD. Commit dce30ca9e3b6 ("fs: fix guard_bio_eod to check for real EOD errors") tries to fix this issue by returning -EIO for this situation. However, this way lets fs user code lose chance to handle -EIO, then sync_inodes_sb() may hang for ever. Also the current truncating on last segment is dangerous by updating the last bvec, given bvec table becomes not immutable any more, and fs bio users may not retrieve the truncated pages via bio_for_each_segment_all() in its .end_io callback. Fixes this issue by supporting multi-segment truncating. And the approach is simpler: - just update bio size since block layer can make correct bvec with the updated bio size. Then bvec table becomes really immutable. - zero all truncated segments for read bio Cc: Carlos Maiolino Cc: linux-fsdevel@vger.kernel.org Fixed-by: dce30ca9e3b6 ("fs: fix guard_bio_eod to check for real EOD errors") Reported-by: syzbot+2b9e54155c8c25d8d165@syzkaller.appspotmail.com Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 39 +++++++++++++++++++++++++++++++++++++++ fs/buffer.c | 25 +------------------------ include/linux/bio.h | 1 + 3 files changed, 41 insertions(+), 24 deletions(-) diff --git a/block/bio.c b/block/bio.c index a5d75f6bf4c7..006bcc52a77e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -538,6 +538,45 @@ void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) } EXPORT_SYMBOL(zero_fill_bio_iter); +void bio_truncate(struct bio *bio, unsigned new_size) +{ + struct bio_vec bv; + struct bvec_iter iter; + unsigned int done = 0; + bool truncated = false; + + if (new_size >= bio->bi_iter.bi_size) + return; + + if (bio_data_dir(bio) != READ) + goto exit; + + bio_for_each_segment(bv, bio, iter) { + if (done + bv.bv_len > new_size) { + unsigned offset; + + if (!truncated) + offset = new_size - done; + else + offset = 0; + zero_user(bv.bv_page, offset, bv.bv_len - offset); + truncated = true; + } + done += bv.bv_len; + } + + exit: + /* + * Don't touch bvec table here and make it really immutable, since + * fs bio user has to retrieve all pages via bio_for_each_segment_all + * in its .end_bio() callback. + * + * It is enough to truncate bio by updating .bi_size since we can make + * correct bvec with the updated .bi_size for drivers. + */ + bio->bi_iter.bi_size = new_size; +} + /** * bio_put - release a reference to a bio * @bio: bio to release reference to diff --git a/fs/buffer.c b/fs/buffer.c index d8c7242426bb..e94a6619464c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3034,8 +3034,6 @@ static void end_bio_bh_io_sync(struct bio *bio) void guard_bio_eod(int op, struct bio *bio) { sector_t maxsector; - struct bio_vec *bvec = bio_last_bvec_all(bio); - unsigned truncated_bytes; struct hd_struct *part; rcu_read_lock(); @@ -3061,28 +3059,7 @@ void guard_bio_eod(int op, struct bio *bio) if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) return; - /* Uhhuh. We've got a bio that straddles the device size! */ - truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9); - - /* - * The bio contains more than one segment which spans EOD, just return - * and let IO layer turn it into an EIO - */ - if (truncated_bytes > bvec->bv_len) - return; - - /* Truncate the bio.. */ - bio->bi_iter.bi_size -= truncated_bytes; - bvec->bv_len -= truncated_bytes; - - /* ..and clear the end of the buffer for reads */ - if (op == REQ_OP_READ) { - struct bio_vec bv; - - mp_bvec_last_segment(bvec, &bv); - zero_user(bv.bv_page, bv.bv_offset + bv.bv_len, - truncated_bytes); - } + bio_truncate(bio, maxsector << 9); } static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, diff --git a/include/linux/bio.h b/include/linux/bio.h index 3cdb84cdc488..853d92ceee64 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -470,6 +470,7 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, gfp_t); extern int bio_uncopy_user(struct bio *); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); +void bio_truncate(struct bio *bio, unsigned new_size); static inline void zero_fill_bio(struct bio *bio) { From 5d30ed3c2c74eb123668d0746e23105c8fc8aed3 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 20 Dec 2019 18:57:16 -0500 Subject: [PATCH 076/146] Revert "drm/amdgpu: simplify ATPX detection" This reverts commit f5fda6d89afe6e9cedaa1c3303903c905262f6e8. You can't use BASE_CLASS in pci_get_class. Bug: https://gitlab.freedesktop.org/drm/amd/issues/995 Acked-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c index a97fb759e2f4..3e35a8f2c5e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c @@ -613,7 +613,17 @@ static bool amdgpu_atpx_detect(void) bool d3_supported = false; struct pci_dev *parent_pdev; - while ((pdev = pci_get_class(PCI_BASE_CLASS_DISPLAY << 16, pdev)) != NULL) { + while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, pdev)) != NULL) { + vga_count++; + + has_atpx |= (amdgpu_atpx_pci_probe_handle(pdev) == true); + + parent_pdev = pci_upstream_bridge(pdev); + d3_supported |= parent_pdev && parent_pdev->bridge_d3; + amdgpu_atpx_get_quirks(pdev); + } + + while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_OTHER << 8, pdev)) != NULL) { vga_count++; has_atpx |= (amdgpu_atpx_pci_probe_handle(pdev) == true); From 073d5eef9e043c2b7e3ef12bc6c879b1d248e831 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 17 Dec 2019 09:35:01 -0500 Subject: [PATCH 077/146] drm/amdgpu/smu: add metrics table lock This table is used for lots of things, add it's own lock. Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Reviewed-by: Kevin Wang Reviewed-by: Evan Quan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 1 + drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 5ff7ccedfbed..a23729d3174b 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -866,6 +866,7 @@ static int smu_sw_init(void *handle) smu->smu_baco.platform_support = false; mutex_init(&smu->sensor_lock); + mutex_init(&smu->metrics_lock); smu->watermarks_bitmap = 0; smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index ac9758305ab3..41fce75b263f 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -349,6 +349,7 @@ struct smu_context const struct pptable_funcs *ppt_funcs; struct mutex mutex; struct mutex sensor_lock; + struct mutex metrics_lock; uint64_t pool_size; struct smu_table_context smu_table; From 1da87c9f67c98d552679974dbfc1f0f65b6a0a53 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 17 Dec 2019 09:49:52 -0500 Subject: [PATCH 078/146] drm/amdgpu/smu: add metrics table lock for arcturus (v2) To protect access to the metrics table. v2: unlock on error Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Reviewed-by: Kevin Wang Reviewed-by: Evan Quan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/powerplay/arcturus_ppt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c index cc71a1078a7a..472e9fed411a 100644 --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c @@ -862,18 +862,21 @@ static int arcturus_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + HZ / 1000)) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); if (ret) { pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; } smu_table->metrics_time = jiffies; } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } From e0e384c398d4638e54b6d2098f0ceaafdab870ee Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 17 Dec 2019 09:50:42 -0500 Subject: [PATCH 079/146] drm/amdgpu/smu: add metrics table lock for navi (v2) To protect access to the metrics table. v2: unlock on error Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Reviewed-by: Kevin Wang Reviewed-by: Evan Quan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c index 4a14fd1f9fd5..ca62e92e5a4f 100644 --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c @@ -562,17 +562,20 @@ static int navi10_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + msecs_to_jiffies(100))) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); if (ret) { pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; } smu_table->metrics_time = jiffies; } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } From 1c455101c6d10c99b310d6bcf613244c97854012 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 17 Dec 2019 09:51:40 -0500 Subject: [PATCH 080/146] drm/amdgpu/smu: add metrics table lock for vega20 (v2) To protect access to the metrics table. v2: unlock on error Bug: https://gitlab.freedesktop.org/drm/amd/issues/900 Reviewed-by: Kevin Wang Reviewed-by: Evan Quan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/powerplay/vega20_ppt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c index 60b9ff097142..0d3a3b0a934e 100644 --- a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c @@ -1678,17 +1678,20 @@ static int vega20_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + HZ / 1000)) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); if (ret) { pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; } smu_table->metrics_time = jiffies; } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } From e0c63812352298efbce2a71483c1dab627d0c288 Mon Sep 17 00:00:00 2001 From: changzhu Date: Thu, 12 Dec 2019 13:46:06 +0800 Subject: [PATCH 081/146] drm/amdgpu: enable gfxoff for raven1 refresh When smu version is larger than 0x41e2b, it will load raven_kicker_rlc.bin.To enable gfxoff for raven_kicker_rlc.bin,it needs to avoid adev->pm.pp_feature &= ~PP_GFXOFF_MASK when it loads raven_kicker_rlc.bin. Signed-off-by: changzhu Reviewed-by: Huang Rui Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 66328ffa395a..97105a5bb246 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -1052,17 +1052,10 @@ static void gfx_v9_0_check_if_need_gfxoff(struct amdgpu_device *adev) case CHIP_VEGA20: break; case CHIP_RAVEN: - /* Disable GFXOFF on original raven. There are combinations - * of sbios and platforms that are not stable. - */ - if (!(adev->rev_id >= 0x8 || adev->pdev->device == 0x15d8)) - adev->pm.pp_feature &= ~PP_GFXOFF_MASK; - else if (!(adev->rev_id >= 0x8 || adev->pdev->device == 0x15d8) - &&((adev->gfx.rlc_fw_version != 106 && - adev->gfx.rlc_fw_version < 531) || - (adev->gfx.rlc_fw_version == 53815) || - (adev->gfx.rlc_feature_version < 1) || - !adev->gfx.rlc.is_rlc_v2_1)) + if (!(adev->rev_id >= 0x8 || + adev->pdev->device == 0x15d8) && + (adev->pm.fw_version < 0x41e2b || /* not raven1 fresh */ + !adev->gfx.rlc.is_rlc_v2_1)) /* without rlc save restore ucodes */ adev->pm.pp_feature &= ~PP_GFXOFF_MASK; if (adev->pm.pp_feature & PP_GFXOFF_MASK) From 0aec96f5897ac16ad9945f531b4bef9a2edd2ebd Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 18 Dec 2019 20:26:06 +0100 Subject: [PATCH 082/146] ALSA: ice1724: Fix sleep-in-atomic in Infrasonic Quartet support code Jia-Ju Bai reported a possible sleep-in-atomic scenario in the ice1724 driver with Infrasonic Quartet support code: namely, ice->set_rate callback gets called inside ice->reg_lock spinlock, while the callback in quartet.c holds ice->gpio_mutex. This patch fixes the invalid call: it simply moves the calls of ice->set_rate and ice->set_mclk callbacks outside the spinlock. Reported-by: Jia-Ju Bai Cc: Link: https://lore.kernel.org/r/5d43135e-73b9-a46a-2155-9e91d0dcdf83@gmail.com Link: https://lore.kernel.org/r/20191218192606.12866-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/ice1712/ice1724.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sound/pci/ice1712/ice1724.c b/sound/pci/ice1712/ice1724.c index c80a16ee6e76..242542e23d28 100644 --- a/sound/pci/ice1712/ice1724.c +++ b/sound/pci/ice1712/ice1724.c @@ -647,6 +647,7 @@ static int snd_vt1724_set_pro_rate(struct snd_ice1712 *ice, unsigned int rate, unsigned long flags; unsigned char mclk_change; unsigned int i, old_rate; + bool call_set_rate = false; if (rate > ice->hw_rates->list[ice->hw_rates->count - 1]) return -EINVAL; @@ -670,7 +671,7 @@ static int snd_vt1724_set_pro_rate(struct snd_ice1712 *ice, unsigned int rate, * setting clock rate for internal clock mode */ old_rate = ice->get_rate(ice); if (force || (old_rate != rate)) - ice->set_rate(ice, rate); + call_set_rate = true; else if (rate == ice->cur_rate) { spin_unlock_irqrestore(&ice->reg_lock, flags); return 0; @@ -678,12 +679,14 @@ static int snd_vt1724_set_pro_rate(struct snd_ice1712 *ice, unsigned int rate, } ice->cur_rate = rate; + spin_unlock_irqrestore(&ice->reg_lock, flags); + + if (call_set_rate) + ice->set_rate(ice, rate); /* setting master clock */ mclk_change = ice->set_mclk(ice, rate); - spin_unlock_irqrestore(&ice->reg_lock, flags); - if (mclk_change && ice->gpio.i2s_mclk_changed) ice->gpio.i2s_mclk_changed(ice); if (ice->gpio.set_pro_rate) From 314bd842d98e1035cc40b671a71e07f48420e58f Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Sun, 29 Dec 2019 13:40:22 +0200 Subject: [PATCH 083/146] mlxsw: spectrum_router: Skip loopback RIFs during MAC validation When a router interface (RIF) is created the MAC address of the backing netdev is verified to have the same MSBs as existing RIFs. This is required in order to avoid changing existing RIF MAC addresses that all share the same MSBs. Loopback RIFs are special in this regard as they do not have a MAC address, given they are only used to loop packets from the overlay to the underlay. Without this change, an error is returned when trying to create a RIF after the creation of a GRE tunnel that is represented by a loopback RIF. 'rif->dev->dev_addr' points to the GRE device's local IP, which does not share the same MSBs as physical interfaces. Adding an IP address to any physical interface results in: Error: mlxsw_spectrum: All router interface MAC addresses must have the same prefix. Fix this by skipping loopback RIFs during MAC validation. Fixes: 74bc99397438 ("mlxsw: spectrum_router: Veto unsupported RIF MAC addresses") Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 08b7e9f964da..8290e82240fc 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -7079,6 +7079,9 @@ static int mlxsw_sp_router_port_check_rif_addr(struct mlxsw_sp *mlxsw_sp, for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++) { rif = mlxsw_sp->router->rifs[i]; + if (rif && rif->ops && + rif->ops->type == MLXSW_SP_RIF_TYPE_IPIP_LB) + continue; if (rif && rif->dev && rif->dev != dev && !ether_addr_equal_masked(rif->dev->dev_addr, dev_addr, mlxsw_sp->mac_mask)) { From acca789a358cc960be3937851d7de6591c79d6c2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 29 Dec 2019 13:40:23 +0200 Subject: [PATCH 084/146] mlxsw: spectrum: Use dedicated policer for VRRP packets Currently, VRRP packets and packets that hit exceptions during routing (e.g., MTU error) are policed using the same policer towards the CPU. This means, for example, that misconfiguration of the MTU on a routed interface can prevent VRRP packets from reaching the CPU, which in turn can cause the VRRP daemon to assume it is the Master router. Fix this by using a dedicated policer for VRRP packets. Fixes: 11566d34f895 ("mlxsw: spectrum: Add VRRP traps") Signed-off-by: Ido Schimmel Reported-by: Alex Veber Tested-by: Alex Veber Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 1 + drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 5294a1622643..af30e8a76682 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -5472,6 +5472,7 @@ enum mlxsw_reg_htgt_trap_group { MLXSW_REG_HTGT_TRAP_GROUP_SP_LBERROR, MLXSW_REG_HTGT_TRAP_GROUP_SP_PTP0, MLXSW_REG_HTGT_TRAP_GROUP_SP_PTP1, + MLXSW_REG_HTGT_TRAP_GROUP_SP_VRRP, __MLXSW_REG_HTGT_TRAP_GROUP_MAX, MLXSW_REG_HTGT_TRAP_GROUP_MAX = __MLXSW_REG_HTGT_TRAP_GROUP_MAX - 1 diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 556dca328bb5..f7fd5e8fbf96 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4542,8 +4542,8 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV6, TRAP_TO_CPU, ROUTER_EXP, false), MLXSW_SP_RXL_MARK(IPIP_DECAP_ERROR, TRAP_TO_CPU, ROUTER_EXP, false), MLXSW_SP_RXL_MARK(DECAP_ECN0, TRAP_TO_CPU, ROUTER_EXP, false), - MLXSW_SP_RXL_MARK(IPV4_VRRP, TRAP_TO_CPU, ROUTER_EXP, false), - MLXSW_SP_RXL_MARK(IPV6_VRRP, TRAP_TO_CPU, ROUTER_EXP, false), + MLXSW_SP_RXL_MARK(IPV4_VRRP, TRAP_TO_CPU, VRRP, false), + MLXSW_SP_RXL_MARK(IPV6_VRRP, TRAP_TO_CPU, VRRP, false), /* PKT Sample trap */ MLXSW_RXL(mlxsw_sp_rx_listener_sample_func, PKT_SAMPLE, MIRROR_TO_CPU, false, SP_IP2ME, DISCARD), @@ -4626,6 +4626,10 @@ static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core) rate = 19 * 1024; burst_size = 12; break; + case MLXSW_REG_HTGT_TRAP_GROUP_SP_VRRP: + rate = 360; + burst_size = 7; + break; default: continue; } @@ -4665,6 +4669,7 @@ static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core) case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF: case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM: case MLXSW_REG_HTGT_TRAP_GROUP_SP_PTP0: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_VRRP: priority = 5; tc = 5; break; From 48e01504cf5315cbe6de9b7412e792bfcc3dd9e1 Mon Sep 17 00:00:00 2001 From: Chris Chiu Date: Mon, 30 Dec 2019 11:11:18 +0800 Subject: [PATCH 085/146] ALSA: hda/realtek - Enable the bass speaker of ASUS UX431FLC ASUS reported that there's an bass speaker in addition to internal speaker and it uses DAC 0x02. It was not enabled in the commit 436e25505f34 ("ALSA: hda/realtek - Enable internal speaker of ASUS UX431FLC") which only enables the amplifier and the front speaker. This commit enables the bass speaker on top of the aforementioned work to improve the acoustic experience. Fixes: 436e25505f34 ("ALSA: hda/realtek - Enable internal speaker of ASUS UX431FLC") Signed-off-by: Chris Chiu Signed-off-by: Jian-Hong Pan Cc: Link: https://lore.kernel.org/r/20191230031118.95076-1-chiu@endlessm.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 38 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 2ee703c2da78..1cd4906a67e1 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5905,11 +5905,12 @@ enum { ALC256_FIXUP_ASUS_HEADSET_MIC, ALC256_FIXUP_ASUS_MIC_NO_PRESENCE, ALC299_FIXUP_PREDATOR_SPK, - ALC294_FIXUP_ASUS_INTSPK_HEADSET_MIC, ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE, - ALC294_FIXUP_ASUS_INTSPK_GPIO, ALC289_FIXUP_DELL_SPK2, ALC289_FIXUP_DUAL_SPK, + ALC294_FIXUP_SPK2_TO_DAC1, + ALC294_FIXUP_ASUS_DUAL_SPK, + }; static const struct hda_fixup alc269_fixups[] = { @@ -6984,16 +6985,6 @@ static const struct hda_fixup alc269_fixups[] = { { } } }, - [ALC294_FIXUP_ASUS_INTSPK_HEADSET_MIC] = { - .type = HDA_FIXUP_PINS, - .v.pins = (const struct hda_pintbl[]) { - { 0x14, 0x411111f0 }, /* disable confusing internal speaker */ - { 0x19, 0x04a11150 }, /* use as headset mic, without its own jack detect */ - { } - }, - .chained = true, - .chain_id = ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC - }, [ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { @@ -7004,13 +6995,6 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC256_FIXUP_ASUS_HEADSET_MODE }, - [ALC294_FIXUP_ASUS_INTSPK_GPIO] = { - .type = HDA_FIXUP_FUNC, - /* The GPIO must be pulled to initialize the AMP */ - .v.func = alc_fixup_gpio4, - .chained = true, - .chain_id = ALC294_FIXUP_ASUS_INTSPK_HEADSET_MIC - }, [ALC289_FIXUP_DELL_SPK2] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { @@ -7026,6 +7010,20 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC289_FIXUP_DELL_SPK2 }, + [ALC294_FIXUP_SPK2_TO_DAC1] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc285_fixup_speaker2_to_dac1, + .chained = true, + .chain_id = ALC294_FIXUP_ASUS_HEADSET_MIC + }, + [ALC294_FIXUP_ASUS_DUAL_SPK] = { + .type = HDA_FIXUP_FUNC, + /* The GPIO must be pulled to initialize the AMP */ + .v.func = alc_fixup_gpio4, + .chained = true, + .chain_id = ALC294_FIXUP_SPK2_TO_DAC1 + }, + }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -7187,7 +7185,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1427, "Asus Zenbook UX31E", ALC269VB_FIXUP_ASUS_ZENBOOK), SND_PCI_QUIRK(0x1043, 0x1517, "Asus Zenbook UX31A", ALC269VB_FIXUP_ASUS_ZENBOOK_UX31A), SND_PCI_QUIRK(0x1043, 0x16e3, "ASUS UX50", ALC269_FIXUP_STEREO_DMIC), - SND_PCI_QUIRK(0x1043, 0x17d1, "ASUS UX431FL", ALC294_FIXUP_ASUS_INTSPK_GPIO), + SND_PCI_QUIRK(0x1043, 0x17d1, "ASUS UX431FL", ALC294_FIXUP_ASUS_DUAL_SPK), SND_PCI_QUIRK(0x1043, 0x18b1, "Asus MJ401TA", ALC256_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1a13, "Asus G73Jw", ALC269_FIXUP_ASUS_G73JW), SND_PCI_QUIRK(0x1043, 0x1a30, "ASUS X705UD", ALC256_FIXUP_ASUS_MIC), From 6da3eced8c5f3b03340b0c395bacd552c4d52411 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 23 Dec 2019 14:31:47 +0100 Subject: [PATCH 086/146] powerpc/spinlocks: Include correct header for static key MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recently, the spinlock implementation grew a static key optimization, but the jump_label.h header include was left out, leading to build errors: linux/arch/powerpc/include/asm/spinlock.h:44:7: error: implicit declaration of function ‘static_branch_unlikely’ 44 | if (!static_branch_unlikely(&shared_processor)) This commit adds the missing header. mpe: The build break is only seen with CONFIG_JUMP_LABEL=n. Fixes: 656c21d6af5d ("powerpc/shared: Use static key to detect shared processor") Signed-off-by: Jason A. Donenfeld Reviewed-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191223133147.129983-1-Jason@zx2c4.com --- arch/powerpc/include/asm/spinlock.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 1b55fc08f853..860228e917dc 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -15,6 +15,7 @@ * * (the type definitions are in asm/spinlock_types.h) */ +#include #include #ifdef CONFIG_PPC64 #include From 7b62e66cbbfb463a39bf83e30bdbbb4b9e83fa03 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 11 Dec 2019 16:07:06 -0800 Subject: [PATCH 087/146] btrfs: punt all bios created in btrfs_submit_compressed_write() Compressed writes happen in the background via kworkers. However, this causes bios to be attributed to root bypassing any cgroup limits from the actual writer. We tag the first bio with REQ_CGROUP_PUNT, which will punt the bio to an appropriate cgroup specific workqueue and attribute the IO properly. However, if btrfs_submit_compressed_write() creates a new bio, we don't tag it the same way. Add the appropriate tagging for subsequent bios. Fixes: ec39f7696ccfa ("Btrfs: use REQ_CGROUP_PUNT for worker thread submitted bios") Reviewed-by: Chris Mason Signed-off-by: Dennis Zhou Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ee834ef7beb4..b08e16b8cebb 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -491,6 +491,10 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; + if (blkcg_css) { + bio->bi_opf |= REQ_CGROUP_PUNT; + bio_associate_blkg_from_css(bio, blkcg_css); + } bio_add_page(bio, page, PAGE_SIZE, 0); } if (bytes_left < PAGE_SIZE) { From 46bcff2bfc5e6a8c638d3a32e4f6f6fa4bd01461 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 11 Dec 2019 15:20:15 -0800 Subject: [PATCH 088/146] btrfs: fix compressed write bio blkcg attribution Bio attribution is handled at bio_set_dev() as once we have a device, we have a corresponding request_queue and then can derive the current css. In special cases, we want to attribute to bio to someone else. This can be done by calling bio_associate_blkg_from_css() or kthread_associate_blkcg() depending on the scenario. Btrfs does this for compressed writeback as they are handled by kworkers, so the latter can be done here. Commit 1a41802701ec ("btrfs: drop bio_set_dev where not needed") removes early bio_set_dev() calls prior to submit_stripe_bio(). This breaks the above assumption that we'll have a request_queue when we are doing association. To fix this, switch to using kthread_associate_blkcg(). Without this, we crash in btrfs/024: [ 3052.093088] BUG: kernel NULL pointer dereference, address: 0000000000000510 [ 3052.107013] #PF: supervisor read access in kernel mode [ 3052.107014] #PF: error_code(0x0000) - not-present page [ 3052.107015] PGD 0 P4D 0 [ 3052.107021] Oops: 0000 [#1] SMP [ 3052.138904] CPU: 42 PID: 201270 Comm: kworker/u161:0 Kdump: loaded Not tainted 5.5.0-rc1-00062-g4852d8ac90a9 #712 [ 3052.138905] Hardware name: Quanta Tioga Pass Single Side 01-0032211004/Tioga Pass Single Side, BIOS F08_3A18 12/20/2018 [ 3052.138912] Workqueue: btrfs-delalloc btrfs_work_helper [ 3052.191375] RIP: 0010:bio_associate_blkg_from_css+0x1e/0x3c0 [ 3052.191379] RSP: 0018:ffffc900210cfc90 EFLAGS: 00010282 [ 3052.191380] RAX: 0000000000000000 RBX: ffff88bfe5573c00 RCX: 0000000000000000 [ 3052.191382] RDX: ffff889db48ec2f0 RSI: ffff88bfe5573c00 RDI: ffff889db48ec2f0 [ 3052.191386] RBP: 0000000000000800 R08: 0000000000203bb0 R09: ffff889db16b2400 [ 3052.293364] R10: 0000000000000000 R11: ffff88a07fffde80 R12: ffff889db48ec2f0 [ 3052.293365] R13: 0000000000001000 R14: ffff889de82bc000 R15: ffff889e2b7bdcc8 [ 3052.293367] FS: 0000000000000000(0000) GS:ffff889ffba00000(0000) knlGS:0000000000000000 [ 3052.293368] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3052.293369] CR2: 0000000000000510 CR3: 0000000002611001 CR4: 00000000007606e0 [ 3052.293370] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 3052.293371] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 3052.293372] PKRU: 55555554 [ 3052.293376] Call Trace: [ 3052.402552] btrfs_submit_compressed_write+0x137/0x390 [ 3052.402558] submit_compressed_extents+0x40f/0x4c0 [ 3052.422401] btrfs_work_helper+0x246/0x5a0 [ 3052.422408] process_one_work+0x200/0x570 [ 3052.438601] ? process_one_work+0x180/0x570 [ 3052.438605] worker_thread+0x4c/0x3e0 [ 3052.438614] kthread+0x103/0x140 [ 3052.460735] ? process_one_work+0x570/0x570 [ 3052.460737] ? kthread_mod_delayed_work+0xc0/0xc0 [ 3052.460744] ret_from_fork+0x24/0x30 Fixes: 1a41802701ec ("btrfs: drop bio_set_dev where not needed") Reported-by: Chris Murphy Signed-off-by: Dennis Zhou Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b08e16b8cebb..43e1660f450f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -447,7 +447,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, if (blkcg_css) { bio->bi_opf |= REQ_CGROUP_PUNT; - bio_associate_blkg_from_css(bio, blkcg_css); + kthread_associate_blkcg(blkcg_css); } refcount_set(&cb->pending_bios, 1); @@ -491,10 +491,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; - if (blkcg_css) { + if (blkcg_css) bio->bi_opf |= REQ_CGROUP_PUNT; - bio_associate_blkg_from_css(bio, blkcg_css); - } bio_add_page(bio, page, PAGE_SIZE, 0); } if (bytes_left < PAGE_SIZE) { @@ -521,6 +519,9 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_endio(bio); } + if (blkcg_css) + kthread_associate_blkcg(NULL); + return 0; } From de7999afedff02c6631feab3ea726a0e8f8c3d40 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 11 Dec 2019 09:01:40 +0000 Subject: [PATCH 089/146] Btrfs: fix infinite loop during nocow writeback due to race When starting writeback for a range that covers part of a preallocated extent, due to a race with writeback for another range that also covers another part of the same preallocated extent, we can end up in an infinite loop. Consider the following example where for inode 280 we have two dirty ranges: range A, from 294912 to 303103, 8192 bytes range B, from 348160 to 438271, 90112 bytes and we have the following file extent item layout for our inode: leaf 38895616 gen 24544 total ptrs 29 free space 13820 owner 5 (...) item 27 key (280 108 200704) itemoff 14598 itemsize 53 extent data disk bytenr 0 nr 0 type 1 (regular) extent data offset 0 nr 94208 ram 94208 item 28 key (280 108 294912) itemoff 14545 itemsize 53 extent data disk bytenr 10433052672 nr 81920 type 2 (prealloc) extent data offset 0 nr 81920 ram 81920 Then the following happens: 1) Writeback starts for range B (from 348160 to 438271), execution of run_delalloc_nocow() starts; 2) The first iteration of run_delalloc_nocow()'s whil loop leaves us at the extent item at slot 28, pointing to the prealloc extent item covering the range from 294912 to 376831. This extent covers part of our range; 3) An ordered extent is created against that extent, covering the file range from 348160 to 376831 (28672 bytes); 4) We adjust 'cur_offset' to 376832 and move on to the next iteration of the while loop; 5) The call to btrfs_lookup_file_extent() leaves us at the same leaf, pointing to slot 29, 1 slot after the last item (the extent item we processed in the previous iteration); 6) Because we are a slot beyond the last item, we call btrfs_next_leaf(), which releases the search path before doing a another search for the last key of the leaf (280 108 294912); 7) Right after btrfs_next_leaf() released the path, and before it did another search for the last key of the leaf, writeback for the range A (from 294912 to 303103) completes (it was previously started at some point); 8) Upon completion of the ordered extent for range A, the prealloc extent we previously found got split into two extent items, one covering the range from 294912 to 303103 (8192 bytes), with a type of regular extent (and no longer prealloc) and another covering the range from 303104 to 376831 (73728 bytes), with a type of prealloc and an offset of 8192 bytes. So our leaf now has the following layout: leaf 38895616 gen 24544 total ptrs 31 free space 13664 owner 5 (...) item 27 key (280 108 200704) itemoff 14598 itemsize 53 extent data disk bytenr 0 nr 0 type 1 extent data offset 0 nr 8192 ram 94208 item 28 key (280 108 208896) itemoff 14545 itemsize 53 extent data disk bytenr 10433142784 nr 86016 type 1 extent data offset 0 nr 86016 ram 86016 item 29 key (280 108 294912) itemoff 14492 itemsize 53 extent data disk bytenr 10433052672 nr 81920 type 1 extent data offset 0 nr 8192 ram 81920 item 30 key (280 108 303104) itemoff 14439 itemsize 53 extent data disk bytenr 10433052672 nr 81920 type 2 extent data offset 8192 nr 73728 ram 81920 9) After btrfs_next_leaf() returns, we have our path pointing to that same leaf and at slot 30, since it has a key we didn't have before and it's the first key greater then the key that was previously the last key of the leaf (key (280 108 294912)); 10) The extent item at slot 30 covers the range from 303104 to 376831 which is in our target range, so we process it, despite having already created an ordered extent against this extent for the file range from 348160 to 376831. This is because we skip to the next extent item only if its end is less than or equals to the start of our delalloc range, and not less than or equals to the current offset ('cur_offset'); 11) As a result we compute 'num_bytes' as: num_bytes = min(end + 1, extent_end) - cur_offset; = min(438271 + 1, 376832) - 376832 = 0 12) We then call create_io_em() for a 0 bytes range starting at offset 376832; 13) Then create_io_em() enters an infinite loop because its calls to btrfs_drop_extent_cache() do nothing due to the 0 length range passed to it. So no existing extent maps that cover the offset 376832 get removed, and therefore calls to add_extent_mapping() return -EEXIST, resulting in an infinite loop. This loop from create_io_em() is the following: do { btrfs_drop_extent_cache(BTRFS_I(inode), em->start, em->start + em->len - 1, 0); write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); /* * The caller has taken lock_extent(), who could race with us * to add em? */ } while (ret == -EEXIST); Also, each call to btrfs_drop_extent_cache() triggers a warning because the start offset passed to it (376832) is smaller then the end offset (376832 - 1) passed to it by -1, due to the 0 length: [258532.052621] ------------[ cut here ]------------ [258532.052643] WARNING: CPU: 0 PID: 9987 at fs/btrfs/file.c:602 btrfs_drop_extent_cache+0x3f4/0x590 [btrfs] (...) [258532.052672] CPU: 0 PID: 9987 Comm: fsx Tainted: G W 5.4.0-rc7-btrfs-next-64 #1 [258532.052673] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014 [258532.052691] RIP: 0010:btrfs_drop_extent_cache+0x3f4/0x590 [btrfs] (...) [258532.052695] RSP: 0018:ffffb4be0153f860 EFLAGS: 00010287 [258532.052700] RAX: ffff975b445ee360 RBX: ffff975b44eb3e08 RCX: 0000000000000000 [258532.052700] RDX: 0000000000038fff RSI: 0000000000039000 RDI: ffff975b445ee308 [258532.052700] RBP: 0000000000038fff R08: 0000000000000000 R09: 0000000000000001 [258532.052701] R10: ffff975b513c5c10 R11: 00000000e3c0cfa9 R12: 0000000000039000 [258532.052703] R13: ffff975b445ee360 R14: 00000000ffffffef R15: ffff975b445ee308 [258532.052705] FS: 00007f86a821de80(0000) GS:ffff975b76a00000(0000) knlGS:0000000000000000 [258532.052707] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [258532.052708] CR2: 00007fdacf0f3ab4 CR3: 00000001f9d26002 CR4: 00000000003606f0 [258532.052712] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [258532.052717] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [258532.052717] Call Trace: [258532.052718] ? preempt_schedule_common+0x32/0x70 [258532.052722] ? ___preempt_schedule+0x16/0x20 [258532.052741] create_io_em+0xff/0x180 [btrfs] [258532.052767] run_delalloc_nocow+0x942/0xb10 [btrfs] [258532.052791] btrfs_run_delalloc_range+0x30b/0x520 [btrfs] [258532.052812] ? find_lock_delalloc_range+0x221/0x250 [btrfs] [258532.052834] writepage_delalloc+0xe4/0x140 [btrfs] [258532.052855] __extent_writepage+0x110/0x4e0 [btrfs] [258532.052876] extent_write_cache_pages+0x21c/0x480 [btrfs] [258532.052906] extent_writepages+0x52/0xb0 [btrfs] [258532.052911] do_writepages+0x23/0x80 [258532.052915] __filemap_fdatawrite_range+0xd2/0x110 [258532.052938] btrfs_fdatawrite_range+0x1b/0x50 [btrfs] [258532.052954] start_ordered_ops+0x57/0xa0 [btrfs] [258532.052973] ? btrfs_sync_file+0x225/0x490 [btrfs] [258532.052988] btrfs_sync_file+0x225/0x490 [btrfs] [258532.052997] __x64_sys_msync+0x199/0x200 [258532.053004] do_syscall_64+0x5c/0x250 [258532.053007] entry_SYSCALL_64_after_hwframe+0x49/0xbe [258532.053010] RIP: 0033:0x7f86a7dfd760 (...) [258532.053014] RSP: 002b:00007ffd99af0368 EFLAGS: 00000246 ORIG_RAX: 000000000000001a [258532.053016] RAX: ffffffffffffffda RBX: 0000000000000ec9 RCX: 00007f86a7dfd760 [258532.053017] RDX: 0000000000000004 RSI: 000000000000836c RDI: 00007f86a8221000 [258532.053019] RBP: 0000000000021ec9 R08: 0000000000000003 R09: 00007f86a812037c [258532.053020] R10: 0000000000000001 R11: 0000000000000246 R12: 00000000000074a3 [258532.053021] R13: 00007f86a8221000 R14: 000000000000836c R15: 0000000000000001 [258532.053032] irq event stamp: 1653450494 [258532.053035] hardirqs last enabled at (1653450493): [] _raw_spin_unlock_irq+0x29/0x50 [258532.053037] hardirqs last disabled at (1653450494): [] trace_hardirqs_off_thunk+0x1a/0x20 [258532.053039] softirqs last enabled at (1653449852): [] __do_softirq+0x466/0x6bd [258532.053042] softirqs last disabled at (1653449845): [] irq_exit+0xec/0x120 [258532.053043] ---[ end trace 8476fce13d9ce20a ]--- Which results in flooding dmesg/syslog since btrfs_drop_extent_cache() uses WARN_ON() and not WARN_ON_ONCE(). So fix this issue by changing run_delalloc_nocow()'s loop to move to the next extent item when the current extent item ends at at offset less than or equals to the current offset instead of the start offset. Fixes: 80ff385665b7fc ("Btrfs: update nodatacow code v2") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e3c76645cad7..5509c41a4f43 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1479,10 +1479,10 @@ static noinline int run_delalloc_nocow(struct inode *inode, disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); /* - * If extent we got ends before our range starts, skip - * to next extent + * If the extent we got ends before our current offset, + * skip to the next extent. */ - if (extent_end <= start) { + if (extent_end <= cur_offset) { path->slots[0]++; goto next_slot; } From 429120f3df2dba2bf3a4a19f4212a53ecefc7102 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 29 Dec 2019 10:32:30 +0800 Subject: [PATCH 090/146] block: fix splitting segments on boundary masks We ran into a problem with a mpt3sas based controller, where we would see random (and hard to reproduce) file corruption). The issue seemed specific to this controller, but wasn't specific to the file system. After a lot of debugging, we find out that it's caused by segments spanning a 4G memory boundary. This shouldn't happen, as the default setting for segment boundary masks is 4G. Turns out there are two issues in get_max_segment_size(): 1) The default segment boundary mask is bypassed 2) The segment start address isn't taken into account when checking segment boundary limit Fix these two issues by removing the bypass of the segment boundary check even if the mask is set to the default value, and taking into account the actual start address of the request when checking if a segment needs splitting. Cc: stable@vger.kernel.org # v5.1+ Reviewed-by: Chris Mason Tested-by: Chris Mason Fixes: dcebd755926b ("block: use bio_for_each_bvec() to compute multi-page bvec count") Signed-off-by: Ming Lei Dropped const on the page pointer, ppc page_to_phys() doesn't mark the page as const... Signed-off-by: Jens Axboe --- block/blk-merge.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index d783bdc4559b..347782a24a35 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -157,16 +157,14 @@ static inline unsigned get_max_io_size(struct request_queue *q, return sectors & (lbs - 1); } -static unsigned get_max_segment_size(const struct request_queue *q, - unsigned offset) +static inline unsigned get_max_segment_size(const struct request_queue *q, + struct page *start_page, + unsigned long offset) { unsigned long mask = queue_segment_boundary(q); - /* default segment boundary mask means no boundary limit */ - if (mask == BLK_SEG_BOUNDARY_MASK) - return queue_max_segment_size(q); - - return min_t(unsigned long, mask - (mask & offset) + 1, + offset = mask & (page_to_phys(start_page) + offset); + return min_t(unsigned long, mask - offset + 1, queue_max_segment_size(q)); } @@ -201,7 +199,8 @@ static bool bvec_split_segs(const struct request_queue *q, unsigned seg_size = 0; while (len && *nsegs < max_segs) { - seg_size = get_max_segment_size(q, bv->bv_offset + total_len); + seg_size = get_max_segment_size(q, bv->bv_page, + bv->bv_offset + total_len); seg_size = min(seg_size, len); (*nsegs)++; @@ -419,7 +418,8 @@ static unsigned blk_bvec_map_sg(struct request_queue *q, while (nbytes > 0) { unsigned offset = bvec->bv_offset + total; - unsigned len = min(get_max_segment_size(q, offset), nbytes); + unsigned len = min(get_max_segment_size(q, bvec->bv_page, + offset), nbytes); struct page *page = bvec->bv_page; /* From c7d776f85dfe5159ebf621ee1e50e555237b1a25 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 26 Dec 2019 15:54:25 +0900 Subject: [PATCH 091/146] null_blk: Fix REQ_OP_ZONE_CLOSE handling In order to match ZBC defined behavior, closing an empty zone must result in the "empty" zone condition instead of the "closed" condition. Fixes: da644b2cc1a4 ("null_blk: add zone open, close, and finish support") Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/block/null_blk_zoned.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index d4d88b581822..5cf49d9db95e 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -186,7 +186,10 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, if (zone->cond == BLK_ZONE_COND_FULL) return BLK_STS_IOERR; - zone->cond = BLK_ZONE_COND_CLOSED; + if (zone->wp == zone->start) + zone->cond = BLK_ZONE_COND_EMPTY; + else + zone->cond = BLK_ZONE_COND_CLOSED; break; case REQ_OP_ZONE_FINISH: if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) From 1f07dcc459d5f2c639f185f6e94829a0c79f2b4c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 30 Dec 2019 12:01:56 -0800 Subject: [PATCH 092/146] kernel.h: Remove unused FIELD_SIZEOF() Now that all callers of FIELD_SIZEOF() have been converted to sizeof_field(), remove the unused prior macro. Signed-off-by: Kees Cook --- include/linux/kernel.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3adcb39fa6f5..0d9db2a14f44 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -79,15 +79,6 @@ */ #define round_down(x, y) ((x) & ~__round_mask(x, y)) -/** - * FIELD_SIZEOF - get the size of a struct's field - * @t: the target struct - * @f: the target struct's field - * Return: the size of @f in the struct definition without having a - * declared instance of @t. - */ -#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) - #define typeof_member(T, m) typeof(((T*)0)->m) #define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP From 0caeaf6ad532f9be5a768a158627cb31921cc8b7 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Mon, 30 Dec 2019 18:14:08 +0530 Subject: [PATCH 093/146] cxgb4/cxgb4vf: fix flow control display for auto negotiation As per 802.3-2005, Section Two, Annex 28B, Table 28B-2 [1], when _only_ Rx pause is enabled, both symmetric and asymmetric pause towards local device must be enabled. Also, firmware returns the local device's flow control pause params as part of advertised capabilities and negotiated params as part of current link attributes. So, fix up ethtool's flow control pause params fetch logic to read from acaps, instead of linkattr. [1] https://standards.ieee.org/standard/802_3-2005.html Fixes: c3168cabe1af ("cxgb4/cxgbvf: Handle 32-bit fw port capabilities") Signed-off-by: Surendra Mobiya Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 1 + .../ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 4 ++-- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 23 +++++++++++-------- .../ethernet/chelsio/cxgb4vf/cxgb4vf_main.c | 4 ++-- .../ethernet/chelsio/cxgb4vf/t4vf_common.h | 1 + .../net/ethernet/chelsio/cxgb4vf/t4vf_hw.c | 20 +++++++++------- 6 files changed, 32 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index a70ac2097892..becee29f5df7 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -504,6 +504,7 @@ struct link_config { enum cc_pause requested_fc; /* flow control user has requested */ enum cc_pause fc; /* actual link flow control */ + enum cc_pause advertised_fc; /* actual advertised flow control */ enum cc_fec requested_fec; /* Forward Error Correction: */ enum cc_fec fec; /* requested and actual in use */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c index 20ab3b6285a2..c837382ee522 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -807,8 +807,8 @@ static void get_pauseparam(struct net_device *dev, struct port_info *p = netdev_priv(dev); epause->autoneg = (p->link_cfg.requested_fc & PAUSE_AUTONEG) != 0; - epause->rx_pause = (p->link_cfg.fc & PAUSE_RX) != 0; - epause->tx_pause = (p->link_cfg.fc & PAUSE_TX) != 0; + epause->rx_pause = (p->link_cfg.advertised_fc & PAUSE_RX) != 0; + epause->tx_pause = (p->link_cfg.advertised_fc & PAUSE_TX) != 0; } static int set_pauseparam(struct net_device *dev, diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index 19d18acfc9a6..844fdcf55118 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -4089,7 +4089,8 @@ static inline fw_port_cap32_t cc_to_fwcap_pause(enum cc_pause cc_pause) if (cc_pause & PAUSE_TX) fw_pause |= FW_PORT_CAP32_802_3_PAUSE; else - fw_pause |= FW_PORT_CAP32_802_3_ASM_DIR; + fw_pause |= FW_PORT_CAP32_802_3_ASM_DIR | + FW_PORT_CAP32_802_3_PAUSE; } else if (cc_pause & PAUSE_TX) { fw_pause |= FW_PORT_CAP32_802_3_ASM_DIR; } @@ -8563,17 +8564,17 @@ static fw_port_cap32_t lstatus_to_fwcap(u32 lstatus) void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl) { const struct fw_port_cmd *cmd = (const void *)rpl; - int action = FW_PORT_CMD_ACTION_G(be32_to_cpu(cmd->action_to_len16)); - struct adapter *adapter = pi->adapter; - struct link_config *lc = &pi->link_cfg; - int link_ok, linkdnrc; - enum fw_port_type port_type; - enum fw_port_module_type mod_type; - unsigned int speed, fc, fec; fw_port_cap32_t pcaps, acaps, lpacaps, linkattr; + struct link_config *lc = &pi->link_cfg; + struct adapter *adapter = pi->adapter; + unsigned int speed, fc, fec, adv_fc; + enum fw_port_module_type mod_type; + int action, link_ok, linkdnrc; + enum fw_port_type port_type; /* Extract the various fields from the Port Information message. */ + action = FW_PORT_CMD_ACTION_G(be32_to_cpu(cmd->action_to_len16)); switch (action) { case FW_PORT_ACTION_GET_PORT_INFO: { u32 lstatus = be32_to_cpu(cmd->u.info.lstatus_to_modtype); @@ -8611,6 +8612,7 @@ void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl) } fec = fwcap_to_cc_fec(acaps); + adv_fc = fwcap_to_cc_pause(acaps); fc = fwcap_to_cc_pause(linkattr); speed = fwcap_to_speed(linkattr); @@ -8667,7 +8669,9 @@ void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl) } if (link_ok != lc->link_ok || speed != lc->speed || - fc != lc->fc || fec != lc->fec) { /* something changed */ + fc != lc->fc || adv_fc != lc->advertised_fc || + fec != lc->fec) { + /* something changed */ if (!link_ok && lc->link_ok) { lc->link_down_rc = linkdnrc; dev_warn_ratelimited(adapter->pdev_dev, @@ -8677,6 +8681,7 @@ void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl) } lc->link_ok = link_ok; lc->speed = speed; + lc->advertised_fc = adv_fc; lc->fc = fc; lc->fec = fec; diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c index f6fc0875d5b0..f4d41f968afa 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c @@ -1690,8 +1690,8 @@ static void cxgb4vf_get_pauseparam(struct net_device *dev, struct port_info *pi = netdev_priv(dev); pauseparam->autoneg = (pi->link_cfg.requested_fc & PAUSE_AUTONEG) != 0; - pauseparam->rx_pause = (pi->link_cfg.fc & PAUSE_RX) != 0; - pauseparam->tx_pause = (pi->link_cfg.fc & PAUSE_TX) != 0; + pauseparam->rx_pause = (pi->link_cfg.advertised_fc & PAUSE_RX) != 0; + pauseparam->tx_pause = (pi->link_cfg.advertised_fc & PAUSE_TX) != 0; } /* diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h index ccca67cf4487..57cfd10a99ec 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h +++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h @@ -135,6 +135,7 @@ struct link_config { enum cc_pause requested_fc; /* flow control user has requested */ enum cc_pause fc; /* actual link flow control */ + enum cc_pause advertised_fc; /* actual advertised flow control */ enum cc_fec auto_fec; /* Forward Error Correction: */ enum cc_fec requested_fec; /* "automatic" (IEEE 802.3), */ diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c index 8a389d617a23..9d49ff211cc1 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c @@ -1913,16 +1913,16 @@ static const char *t4vf_link_down_rc_str(unsigned char link_down_rc) static void t4vf_handle_get_port_info(struct port_info *pi, const struct fw_port_cmd *cmd) { - int action = FW_PORT_CMD_ACTION_G(be32_to_cpu(cmd->action_to_len16)); - struct adapter *adapter = pi->adapter; - struct link_config *lc = &pi->link_cfg; - int link_ok, linkdnrc; - enum fw_port_type port_type; - enum fw_port_module_type mod_type; - unsigned int speed, fc, fec; fw_port_cap32_t pcaps, acaps, lpacaps, linkattr; + struct link_config *lc = &pi->link_cfg; + struct adapter *adapter = pi->adapter; + unsigned int speed, fc, fec, adv_fc; + enum fw_port_module_type mod_type; + int action, link_ok, linkdnrc; + enum fw_port_type port_type; /* Extract the various fields from the Port Information message. */ + action = FW_PORT_CMD_ACTION_G(be32_to_cpu(cmd->action_to_len16)); switch (action) { case FW_PORT_ACTION_GET_PORT_INFO: { u32 lstatus = be32_to_cpu(cmd->u.info.lstatus_to_modtype); @@ -1982,6 +1982,7 @@ static void t4vf_handle_get_port_info(struct port_info *pi, } fec = fwcap_to_cc_fec(acaps); + adv_fc = fwcap_to_cc_pause(acaps); fc = fwcap_to_cc_pause(linkattr); speed = fwcap_to_speed(linkattr); @@ -2012,7 +2013,9 @@ static void t4vf_handle_get_port_info(struct port_info *pi, } if (link_ok != lc->link_ok || speed != lc->speed || - fc != lc->fc || fec != lc->fec) { /* something changed */ + fc != lc->fc || adv_fc != lc->advertised_fc || + fec != lc->fec) { + /* something changed */ if (!link_ok && lc->link_ok) { lc->link_down_rc = linkdnrc; dev_warn_ratelimited(adapter->pdev_dev, @@ -2022,6 +2025,7 @@ static void t4vf_handle_get_port_info(struct port_info *pi, } lc->link_ok = link_ok; lc->speed = speed; + lc->advertised_fc = adv_fc; lc->fc = fc; lc->fec = fec; From 9fcf024dd6fae082f05e8c1fcdae23972b2f6971 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 27 Dec 2019 02:59:54 +0200 Subject: [PATCH 094/146] net: dsa: sja1105: Take PTP egress timestamp by port, not mgmt slot The PTP egress timestamp N must be captured from register PTPEGR_TS[n], where n = 2 * PORT + TSREG. There are 10 PTPEGR_TS registers, 2 per port. We are only using TSREG=0. As opposed to the management slots, which are 4 in number (SJA1105_NUM_PORTS, minus the CPU port). Any management frame (which includes PTP frames) can be sent to any non-CPU port through any management slot. When the CPU port is not the last port (#4), there will be a mismatch between the slot and the port number. Luckily, the only mainline occurrence with this switch (arch/arm/boot/dts/ls1021a-tsn.dts) does have the CPU port as #4, so the issue did not manifest itself thus far. Fixes: 47ed985e97f5 ("net: dsa: sja1105: Add logic for TX timestamping") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 2 +- drivers/net/dsa/sja1105/sja1105_ptp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index a51ac088c0bc..86bbab166633 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1855,7 +1855,7 @@ static netdev_tx_t sja1105_port_deferred_xmit(struct dsa_switch *ds, int port, if (!clone) goto out; - sja1105_ptp_txtstamp_skb(ds, slot, clone); + sja1105_ptp_txtstamp_skb(ds, port, clone); out: mutex_unlock(&priv->mgmt_lock); diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index 54258a25031d..c0fda7db6271 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -659,7 +659,7 @@ void sja1105_ptp_clock_unregister(struct dsa_switch *ds) ptp_data->clock = NULL; } -void sja1105_ptp_txtstamp_skb(struct dsa_switch *ds, int slot, +void sja1105_ptp_txtstamp_skb(struct dsa_switch *ds, int port, struct sk_buff *skb) { struct sja1105_private *priv = ds->priv; @@ -679,7 +679,7 @@ void sja1105_ptp_txtstamp_skb(struct dsa_switch *ds, int slot, goto out; } - rc = sja1105_ptpegr_ts_poll(ds, slot, &ts); + rc = sja1105_ptpegr_ts_poll(ds, port, &ts); if (rc < 0) { dev_err(ds->dev, "timed out polling for tstamp\n"); kfree_skb(skb); From 5a47f588ee2366b2febdc822cdfdcf856cb0a777 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 27 Dec 2019 03:01:50 +0200 Subject: [PATCH 095/146] net: dsa: sja1105: Really make the PTP command read-write When activating tc-taprio offload on the switch ports, the TAS state machine will try to check whether it is running or not, but will find both the STARTED and STOPPED bits as false in the sja1105_tas_check_running function. So the function will return -EINVAL (an abnormal situation) and the kernel will keep printing this from the TAS FSM workqueue: [ 37.691971] sja1105 spi0.1: An operation returned -22 The reason is that the underlying function that gets called, sja1105_ptp_commit, does not actually do a SPI_READ, but a SPI_WRITE. So the command buffer remains initialized with zeroes instead of retrieving the hardware state. Fix that. Fixes: 41603d78b362 ("net: dsa: sja1105: Make the PTP command read-write") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_ptp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index c0fda7db6271..43ab7589d0d0 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -234,7 +234,7 @@ int sja1105_ptp_commit(struct dsa_switch *ds, struct sja1105_ptp_cmd *cmd, if (rw == SPI_WRITE) priv->info->ptp_cmd_packing(buf, cmd, PACK); - rc = sja1105_xfer_buf(priv, SPI_WRITE, regs->ptp_control, buf, + rc = sja1105_xfer_buf(priv, rw, regs->ptp_control, buf, SJA1105_SIZE_PTP_CMD); if (rw == SPI_READ) From d00bdc0a8839de9a5c9be5af2a79dbf8e0087689 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 27 Dec 2019 03:03:54 +0200 Subject: [PATCH 096/146] net: dsa: sja1105: Remove restriction of zero base-time for taprio offload The check originates from the initial implementation which was not based on PTP time but on a standalone clock source. In the meantime we can now program the PTPSCHTM register at runtime with the dynamic base time (actually with a value that is 200 ns smaller, to avoid writing DELTA=0 in the Schedule Entry Points Parameters Table). And we also have logic for moving the actual base time in the future of the PHC's current time base, so the check for zero serves no purpose, since even if the user will specify zero, that's not what will end up in the static config table where the limitation is. Fixes: 86db36a347b4 ("net: dsa: sja1105: Implement state machine for TAS with PTP clock source") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_tas.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_tas.c b/drivers/net/dsa/sja1105/sja1105_tas.c index 26b925b5dace..fa6750d973d7 100644 --- a/drivers/net/dsa/sja1105/sja1105_tas.c +++ b/drivers/net/dsa/sja1105/sja1105_tas.c @@ -477,11 +477,6 @@ int sja1105_setup_tc_taprio(struct dsa_switch *ds, int port, if (admin->cycle_time_extension) return -ENOTSUPP; - if (!ns_to_sja1105_delta(admin->base_time)) { - dev_err(ds->dev, "A base time of zero is not hardware-allowed\n"); - return -ERANGE; - } - for (i = 0; i < admin->num_entries; i++) { s64 delta_ns = admin->entries[i].interval; s64 delta_cycles = ns_to_sja1105_delta(delta_ns); From 3a323ed7c9c6d60af05bada0efe7a8ef56cf317f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 27 Dec 2019 03:08:07 +0200 Subject: [PATCH 097/146] Documentation: net: dsa: sja1105: Remove text about taprio base-time limitation Since commit 86db36a347b4 ("net: dsa: sja1105: Implement state machine for TAS with PTP clock source"), this paragraph is no longer true. So remove it. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- Documentation/networking/dsa/sja1105.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Documentation/networking/dsa/sja1105.rst b/Documentation/networking/dsa/sja1105.rst index eef20d0bcf7c..64553d8d91cb 100644 --- a/Documentation/networking/dsa/sja1105.rst +++ b/Documentation/networking/dsa/sja1105.rst @@ -230,12 +230,6 @@ simultaneously on two ports. The driver checks the consistency of the schedules against this restriction and errors out when appropriate. Schedule analysis is needed to avoid this, which is outside the scope of the document. -At the moment, the time-aware scheduler can only be triggered based on a -standalone clock and not based on PTP time. This means the base-time argument -from tc-taprio is ignored and the schedule starts right away. It also means it -is more difficult to phase-align the scheduler with the other devices in the -network. - Device Tree bindings and board design ===================================== From 54fa49ee88138756df0fcf867cb1849904710a8c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 27 Dec 2019 03:11:13 +0200 Subject: [PATCH 098/146] net: dsa: sja1105: Reconcile the meaning of TPID and TPID2 for E/T and P/Q/R/S For first-generation switches (SJA1105E and SJA1105T): - TPID means C-Tag (typically 0x8100) - TPID2 means S-Tag (typically 0x88A8) While for the second generation switches (SJA1105P, SJA1105Q, SJA1105R, SJA1105S) it is the other way around: - TPID means S-Tag (typically 0x88A8) - TPID2 means C-Tag (typically 0x8100) In other words, E/T tags untagged traffic with TPID, and P/Q/R/S with TPID2. So the patch mentioned below fixed VLAN filtering for P/Q/R/S, but broke it for E/T. We strive for a common code path for all switches in the family, so just lie in the static config packing functions that TPID and TPID2 are at swapped bit offsets than they actually are, for P/Q/R/S. This will make both switches understand TPID to be ETH_P_8021Q and TPID2 to be ETH_P_8021AD. The meaning from the original E/T was chosen over P/Q/R/S because E/T is actually the one with public documentation available (UM10944.pdf). Fixes: f9a1a7646c0d ("net: dsa: sja1105: Reverse TPID and TPID2") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 8 ++++---- drivers/net/dsa/sja1105/sja1105_static_config.c | 7 +++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 86bbab166633..1da5ac111499 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1569,8 +1569,8 @@ static int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) if (enabled) { /* Enable VLAN filtering. */ - tpid = ETH_P_8021AD; - tpid2 = ETH_P_8021Q; + tpid = ETH_P_8021Q; + tpid2 = ETH_P_8021AD; } else { /* Disable VLAN filtering. */ tpid = ETH_P_SJA1105; @@ -1579,9 +1579,9 @@ static int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) table = &priv->static_config.tables[BLK_IDX_GENERAL_PARAMS]; general_params = table->entries; - /* EtherType used to identify outer tagged (S-tag) VLAN traffic */ - general_params->tpid = tpid; /* EtherType used to identify inner tagged (C-tag) VLAN traffic */ + general_params->tpid = tpid; + /* EtherType used to identify outer tagged (S-tag) VLAN traffic */ general_params->tpid2 = tpid2; /* When VLAN filtering is on, we need to at least be able to * decode management traffic through the "backup plan". diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.c b/drivers/net/dsa/sja1105/sja1105_static_config.c index 0d03e13e9909..63d2311817c4 100644 --- a/drivers/net/dsa/sja1105/sja1105_static_config.c +++ b/drivers/net/dsa/sja1105/sja1105_static_config.c @@ -142,6 +142,9 @@ static size_t sja1105et_general_params_entry_packing(void *buf, void *entry_ptr, return size; } +/* TPID and TPID2 are intentionally reversed so that semantic + * compatibility with E/T is kept. + */ static size_t sja1105pqrs_general_params_entry_packing(void *buf, void *entry_ptr, enum packing_op op) @@ -166,9 +169,9 @@ sja1105pqrs_general_params_entry_packing(void *buf, void *entry_ptr, sja1105_packing(buf, &entry->mirr_port, 141, 139, size, op); sja1105_packing(buf, &entry->vlmarker, 138, 107, size, op); sja1105_packing(buf, &entry->vlmask, 106, 75, size, op); - sja1105_packing(buf, &entry->tpid, 74, 59, size, op); + sja1105_packing(buf, &entry->tpid2, 74, 59, size, op); sja1105_packing(buf, &entry->ignore2stf, 58, 58, size, op); - sja1105_packing(buf, &entry->tpid2, 57, 42, size, op); + sja1105_packing(buf, &entry->tpid, 57, 42, size, op); sja1105_packing(buf, &entry->queue_ts, 41, 41, size, op); sja1105_packing(buf, &entry->egrmirrvid, 40, 29, size, op); sja1105_packing(buf, &entry->egrmirrpcp, 28, 26, size, op); From a33121e5487b424339636b25c35d3a180eaa5f5e Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Fri, 27 Dec 2019 03:26:27 +0100 Subject: [PATCH 099/146] ptp: fix the race between the release of ptp_clock and cdev In a case when a ptp chardev (like /dev/ptp0) is open but an underlying device is removed, closing this file leads to a race. This reproduces easily in a kvm virtual machine: ts# cat openptp0.c int main() { ... fp = fopen("/dev/ptp0", "r"); ... sleep(10); } ts# uname -r 5.5.0-rc3-46cf053e ts# cat /proc/cmdline ... slub_debug=FZP ts# modprobe ptp_kvm ts# ./openptp0 & [1] 670 opened /dev/ptp0, sleeping 10s... ts# rmmod ptp_kvm ts# ls /dev/ptp* ls: cannot access '/dev/ptp*': No such file or directory ts# ...woken up [ 48.010809] general protection fault: 0000 [#1] SMP [ 48.012502] CPU: 6 PID: 658 Comm: openptp0 Not tainted 5.5.0-rc3-46cf053e #25 [ 48.014624] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ... [ 48.016270] RIP: 0010:module_put.part.0+0x7/0x80 [ 48.017939] RSP: 0018:ffffb3850073be00 EFLAGS: 00010202 [ 48.018339] RAX: 000000006b6b6b6b RBX: 6b6b6b6b6b6b6b6b RCX: ffff89a476c00ad0 [ 48.018936] RDX: fffff65a08d3ea08 RSI: 0000000000000247 RDI: 6b6b6b6b6b6b6b6b [ 48.019470] ... ^^^ a slub poison [ 48.023854] Call Trace: [ 48.024050] __fput+0x21f/0x240 [ 48.024288] task_work_run+0x79/0x90 [ 48.024555] do_exit+0x2af/0xab0 [ 48.024799] ? vfs_write+0x16a/0x190 [ 48.025082] do_group_exit+0x35/0x90 [ 48.025387] __x64_sys_exit_group+0xf/0x10 [ 48.025737] do_syscall_64+0x3d/0x130 [ 48.026056] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 48.026479] RIP: 0033:0x7f53b12082f6 [ 48.026792] ... [ 48.030945] Modules linked in: ptp i6300esb watchdog [last unloaded: ptp_kvm] [ 48.045001] Fixing recursive fault but reboot is needed! This happens in: static void __fput(struct file *file) { ... if (file->f_op->release) file->f_op->release(inode, file); <<< cdev is kfree'd here if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && !(mode & FMODE_PATH))) { cdev_put(inode->i_cdev); <<< cdev fields are accessed here Namely: __fput() posix_clock_release() kref_put(&clk->kref, delete_clock) <<< the last reference delete_clock() delete_ptp_clock() kfree(ptp) <<< cdev is embedded in ptp cdev_put module_put(p->owner) <<< *p is kfree'd, bang! Here cdev is embedded in posix_clock which is embedded in ptp_clock. The race happens because ptp_clock's lifetime is controlled by two refcounts: kref and cdev.kobj in posix_clock. This is wrong. Make ptp_clock's sysfs device a parent of cdev with cdev_device_add() created especially for such cases. This way the parent device with its ptp_clock is not released until all references to the cdev are released. This adds a requirement that an initialized but not exposed struct device should be provided to posix_clock_register() by a caller instead of a simple dev_t. This approach was adopted from the commit 72139dfa2464 ("watchdog: Fix the race between the release of watchdog_core_data and cdev"). See details of the implementation in the commit 233ed09d7fda ("chardev: add helper function to register char devs with a struct device"). Link: https://lore.kernel.org/linux-fsdevel/20191125125342.6189-1-vdronov@redhat.com/T/#u Analyzed-by: Stephen Johnston Analyzed-by: Vern Lovejoy Signed-off-by: Vladis Dronov Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/ptp/ptp_clock.c | 31 ++++++++++++++----------------- drivers/ptp/ptp_private.h | 2 +- include/linux/posix-clock.h | 19 +++++++++++-------- kernel/time/posix-clock.c | 31 +++++++++++++------------------ 4 files changed, 39 insertions(+), 44 deletions(-) diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index e60eab7f8a61..61fafe0374ce 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -166,9 +166,9 @@ static struct posix_clock_operations ptp_clock_ops = { .read = ptp_read, }; -static void delete_ptp_clock(struct posix_clock *pc) +static void ptp_clock_release(struct device *dev) { - struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock); + struct ptp_clock *ptp = container_of(dev, struct ptp_clock, dev); mutex_destroy(&ptp->tsevq_mux); mutex_destroy(&ptp->pincfg_mux); @@ -213,7 +213,6 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, } ptp->clock.ops = ptp_clock_ops; - ptp->clock.release = delete_ptp_clock; ptp->info = info; ptp->devid = MKDEV(major, index); ptp->index = index; @@ -236,15 +235,6 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, if (err) goto no_pin_groups; - /* Create a new device in our class. */ - ptp->dev = device_create_with_groups(ptp_class, parent, ptp->devid, - ptp, ptp->pin_attr_groups, - "ptp%d", ptp->index); - if (IS_ERR(ptp->dev)) { - err = PTR_ERR(ptp->dev); - goto no_device; - } - /* Register a new PPS source. */ if (info->pps) { struct pps_source_info pps; @@ -260,8 +250,18 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, } } - /* Create a posix clock. */ - err = posix_clock_register(&ptp->clock, ptp->devid); + /* Initialize a new device of our class in our clock structure. */ + device_initialize(&ptp->dev); + ptp->dev.devt = ptp->devid; + ptp->dev.class = ptp_class; + ptp->dev.parent = parent; + ptp->dev.groups = ptp->pin_attr_groups; + ptp->dev.release = ptp_clock_release; + dev_set_drvdata(&ptp->dev, ptp); + dev_set_name(&ptp->dev, "ptp%d", ptp->index); + + /* Create a posix clock and link it to the device. */ + err = posix_clock_register(&ptp->clock, &ptp->dev); if (err) { pr_err("failed to create posix clock\n"); goto no_clock; @@ -273,8 +273,6 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, if (ptp->pps_source) pps_unregister_source(ptp->pps_source); no_pps: - device_destroy(ptp_class, ptp->devid); -no_device: ptp_cleanup_pin_groups(ptp); no_pin_groups: if (ptp->kworker) @@ -304,7 +302,6 @@ int ptp_clock_unregister(struct ptp_clock *ptp) if (ptp->pps_source) pps_unregister_source(ptp->pps_source); - device_destroy(ptp_class, ptp->devid); ptp_cleanup_pin_groups(ptp); posix_clock_unregister(&ptp->clock); diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h index 9171d42468fd..6b97155148f1 100644 --- a/drivers/ptp/ptp_private.h +++ b/drivers/ptp/ptp_private.h @@ -28,7 +28,7 @@ struct timestamp_event_queue { struct ptp_clock { struct posix_clock clock; - struct device *dev; + struct device dev; struct ptp_clock_info *info; dev_t devid; int index; /* index into clocks.map */ diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h index fe6cfdcfbc26..468328b1e1dd 100644 --- a/include/linux/posix-clock.h +++ b/include/linux/posix-clock.h @@ -69,29 +69,32 @@ struct posix_clock_operations { * * @ops: Functional interface to the clock * @cdev: Character device instance for this clock - * @kref: Reference count. + * @dev: Pointer to the clock's device. * @rwsem: Protects the 'zombie' field from concurrent access. * @zombie: If 'zombie' is true, then the hardware has disappeared. - * @release: A function to free the structure when the reference count reaches - * zero. May be NULL if structure is statically allocated. * * Drivers should embed their struct posix_clock within a private * structure, obtaining a reference to it during callbacks using * container_of(). + * + * Drivers should supply an initialized but not exposed struct device + * to posix_clock_register(). It is used to manage lifetime of the + * driver's private structure. It's 'release' field should be set to + * a release function for this private structure. */ struct posix_clock { struct posix_clock_operations ops; struct cdev cdev; - struct kref kref; + struct device *dev; struct rw_semaphore rwsem; bool zombie; - void (*release)(struct posix_clock *clk); }; /** * posix_clock_register() - register a new clock - * @clk: Pointer to the clock. Caller must provide 'ops' and 'release' - * @devid: Allocated device id + * @clk: Pointer to the clock. Caller must provide 'ops' field + * @dev: Pointer to the initialized device. Caller must provide + * 'release' field * * A clock driver calls this function to register itself with the * clock device subsystem. If 'clk' points to dynamically allocated @@ -100,7 +103,7 @@ struct posix_clock { * * Returns zero on success, non-zero otherwise. */ -int posix_clock_register(struct posix_clock *clk, dev_t devid); +int posix_clock_register(struct posix_clock *clk, struct device *dev); /** * posix_clock_unregister() - unregister a clock diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index ec960bb939fd..200fb2d3be99 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -14,8 +14,6 @@ #include "posix-timers.h" -static void delete_clock(struct kref *kref); - /* * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. */ @@ -125,7 +123,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) err = 0; if (!err) { - kref_get(&clk->kref); + get_device(clk->dev); fp->private_data = clk; } out: @@ -141,7 +139,7 @@ static int posix_clock_release(struct inode *inode, struct file *fp) if (clk->ops.release) err = clk->ops.release(clk); - kref_put(&clk->kref, delete_clock); + put_device(clk->dev); fp->private_data = NULL; @@ -161,38 +159,35 @@ static const struct file_operations posix_clock_file_operations = { #endif }; -int posix_clock_register(struct posix_clock *clk, dev_t devid) +int posix_clock_register(struct posix_clock *clk, struct device *dev) { int err; - kref_init(&clk->kref); init_rwsem(&clk->rwsem); cdev_init(&clk->cdev, &posix_clock_file_operations); + err = cdev_device_add(&clk->cdev, dev); + if (err) { + pr_err("%s unable to add device %d:%d\n", + dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt)); + return err; + } clk->cdev.owner = clk->ops.owner; - err = cdev_add(&clk->cdev, devid, 1); + clk->dev = dev; - return err; + return 0; } EXPORT_SYMBOL_GPL(posix_clock_register); -static void delete_clock(struct kref *kref) -{ - struct posix_clock *clk = container_of(kref, struct posix_clock, kref); - - if (clk->release) - clk->release(clk); -} - void posix_clock_unregister(struct posix_clock *clk) { - cdev_del(&clk->cdev); + cdev_device_del(&clk->cdev, clk->dev); down_write(&clk->rwsem); clk->zombie = true; up_write(&clk->rwsem); - kref_put(&clk->kref, delete_clock); + put_device(clk->dev); } EXPORT_SYMBOL_GPL(posix_clock_unregister); From 853697504de043ff0bfd815bd3a64de1dce73dc7 Mon Sep 17 00:00:00 2001 From: Cambda Zhu Date: Fri, 27 Dec 2019 16:52:37 +0800 Subject: [PATCH 100/146] tcp: Fix highest_sack and highest_sack_seq >From commit 50895b9de1d3 ("tcp: highest_sack fix"), the logic about setting tp->highest_sack to the head of the send queue was removed. Of course the logic is error prone, but it is logical. Before we remove the pointer to the highest sack skb and use the seq instead, we need to set tp->highest_sack to NULL when there is no skb after the last sack, and then replace NULL with the real skb when new skb inserted into the rtx queue, because the NULL means the highest sack seq is tp->snd_nxt. If tp->highest_sack is NULL and new data sent, the next ACK with sack option will increase tp->reordering unexpectedly. This patch sets tp->highest_sack to the tail of the rtx queue if it's NULL and new data is sent. The patch keeps the rule that the highest_sack can only be maintained by sack processing, except for this only case. Fixes: 50895b9de1d3 ("tcp: highest_sack fix") Signed-off-by: Cambda Zhu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1f7735ca8f22..58c92a7d671c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -72,6 +72,9 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) __skb_unlink(skb, &sk->sk_write_queue); tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); + if (tp->highest_sack == NULL) + tp->highest_sack = skb; + tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); From a5b72a083da197b493c7ed1e5730d62d3199f7d6 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Sat, 28 Dec 2019 16:36:58 +0100 Subject: [PATCH 101/146] net/sched: add delete_empty() to filters and use it in cls_flower Revert "net/sched: cls_u32: fix refcount leak in the error path of u32_change()", and fix the u32 refcount leak in a more generic way that preserves the semantic of rule dumping. On tc filters that don't support lockless insertion/removal, there is no need to guard against concurrent insertion when a removal is in progress. Therefore, for most of them we can avoid a full walk() when deleting, and just decrease the refcount, like it was done on older Linux kernels. This fixes situations where walk() was wrongly detecting a non-empty filter, like it happened with cls_u32 in the error path of change(), thus leading to failures in the following tdc selftests: 6aa7: (filter, u32) Add/Replace u32 with source match and invalid indev 6658: (filter, u32) Add/Replace u32 with custom hash table and invalid handle 74c2: (filter, u32) Add/Replace u32 filter with invalid hash table id On cls_flower, and on (future) lockless filters, this check is necessary: move all the check_empty() logic in a callback so that each filter can have its own implementation. For cls_flower, it's sufficient to check if no IDRs have been allocated. This reverts commit 275c44aa194b7159d1191817b20e076f55f0e620. Changes since v1: - document the need for delete_empty() when TCF_PROTO_OPS_DOIT_UNLOCKED is used, thanks to Vlad Buslov - implement delete_empty() without doing fl_walk(), thanks to Vlad Buslov - squash revert and new fix in a single patch, to be nice with bisect tests that run tdc on u32 filter, thanks to Dave Miller Fixes: 275c44aa194b ("net/sched: cls_u32: fix refcount leak in the error path of u32_change()") Fixes: 6676d5e416ee ("net: sched: set dedicated tcf_walker flag when tp is empty") Suggested-by: Jamal Hadi Salim Suggested-by: Vlad Buslov Signed-off-by: Davide Caratti Reviewed-by: Vlad Buslov Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/sch_generic.h | 5 +++++ net/sched/cls_api.c | 31 +++++-------------------------- net/sched/cls_flower.c | 12 ++++++++++++ net/sched/cls_u32.c | 25 ------------------------- 4 files changed, 22 insertions(+), 51 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 144f264ea394..fceddf89592a 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -308,6 +308,7 @@ struct tcf_proto_ops { int (*delete)(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *); + bool (*delete_empty)(struct tcf_proto *tp); void (*walk)(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held); int (*reoffload)(struct tcf_proto *tp, bool add, @@ -336,6 +337,10 @@ struct tcf_proto_ops { int flags; }; +/* Classifiers setting TCF_PROTO_OPS_DOIT_UNLOCKED in tcf_proto_ops->flags + * are expected to implement tcf_proto_ops->delete_empty(), otherwise race + * conditions can occur when filters are inserted/deleted simultaneously. + */ enum tcf_proto_ops_flags { TCF_PROTO_OPS_DOIT_UNLOCKED = 1, }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 6a0eacafdb19..76e0d122616a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -308,33 +308,12 @@ static void tcf_proto_put(struct tcf_proto *tp, bool rtnl_held, tcf_proto_destroy(tp, rtnl_held, true, extack); } -static int walker_check_empty(struct tcf_proto *tp, void *fh, - struct tcf_walker *arg) +static bool tcf_proto_check_delete(struct tcf_proto *tp) { - if (fh) { - arg->nonempty = true; - return -1; - } - return 0; -} + if (tp->ops->delete_empty) + return tp->ops->delete_empty(tp); -static bool tcf_proto_is_empty(struct tcf_proto *tp, bool rtnl_held) -{ - struct tcf_walker walker = { .fn = walker_check_empty, }; - - if (tp->ops->walk) { - tp->ops->walk(tp, &walker, rtnl_held); - return !walker.nonempty; - } - return true; -} - -static bool tcf_proto_check_delete(struct tcf_proto *tp, bool rtnl_held) -{ - spin_lock(&tp->lock); - if (tcf_proto_is_empty(tp, rtnl_held)) - tp->deleting = true; - spin_unlock(&tp->lock); + tp->deleting = true; return tp->deleting; } @@ -1751,7 +1730,7 @@ static void tcf_chain_tp_delete_empty(struct tcf_chain *chain, * concurrently. * Mark tp for deletion if it is empty. */ - if (!tp_iter || !tcf_proto_check_delete(tp, rtnl_held)) { + if (!tp_iter || !tcf_proto_check_delete(tp)) { mutex_unlock(&chain->filter_chain_lock); return; } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 0d125de54285..b0f42e62dd76 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2773,6 +2773,17 @@ static void fl_bind_class(void *fh, u32 classid, unsigned long cl) f->res.class = cl; } +static bool fl_delete_empty(struct tcf_proto *tp) +{ + struct cls_fl_head *head = fl_head_dereference(tp); + + spin_lock(&tp->lock); + tp->deleting = idr_is_empty(&head->handle_idr); + spin_unlock(&tp->lock); + + return tp->deleting; +} + static struct tcf_proto_ops cls_fl_ops __read_mostly = { .kind = "flower", .classify = fl_classify, @@ -2782,6 +2793,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .put = fl_put, .change = fl_change, .delete = fl_delete, + .delete_empty = fl_delete_empty, .walk = fl_walk, .reoffload = fl_reoffload, .hw_add = fl_hw_add, diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 66c6bcec16cb..a0e6fac613de 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -1108,33 +1108,10 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return err; } -static bool u32_hnode_empty(struct tc_u_hnode *ht, bool *non_root_ht) -{ - int i; - - if (!ht) - return true; - if (!ht->is_root) { - *non_root_ht = true; - return false; - } - if (*non_root_ht) - return false; - if (ht->refcnt < 2) - return true; - - for (i = 0; i <= ht->divisor; i++) { - if (rtnl_dereference(ht->ht[i])) - return false; - } - return true; -} - static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { struct tc_u_common *tp_c = tp->data; - bool non_root_ht = false; struct tc_u_hnode *ht; struct tc_u_knode *n; unsigned int h; @@ -1147,8 +1124,6 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg, ht = rtnl_dereference(ht->next)) { if (ht->prio != tp->prio) continue; - if (u32_hnode_empty(ht, &non_root_ht)) - return; if (arg->count >= arg->skip) { if (arg->fn(tp, ht, arg) < 0) { arg->stop = 1; From 04b69426d846cd04ca9acefff1ea39e1c64d2714 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sat, 28 Dec 2019 16:28:09 +0000 Subject: [PATCH 102/146] hsr: fix slab-out-of-bounds Read in hsr_debugfs_rename() hsr slave interfaces don't have debugfs directory. So, hsr_debugfs_rename() shouldn't be called when hsr slave interface name is changed. Test commands: ip link add dummy0 type dummy ip link add dummy1 type dummy ip link add hsr0 type hsr slave1 dummy0 slave2 dummy1 ip link set dummy0 name ap Splat looks like: [21071.899367][T22666] ap: renamed from dummy0 [21071.914005][T22666] ================================================================== [21071.919008][T22666] BUG: KASAN: slab-out-of-bounds in hsr_debugfs_rename+0xaa/0xb0 [hsr] [21071.923640][T22666] Read of size 8 at addr ffff88805febcd98 by task ip/22666 [21071.926941][T22666] [21071.927750][T22666] CPU: 0 PID: 22666 Comm: ip Not tainted 5.5.0-rc2+ #240 [21071.929919][T22666] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [21071.935094][T22666] Call Trace: [21071.935867][T22666] dump_stack+0x96/0xdb [21071.936687][T22666] ? hsr_debugfs_rename+0xaa/0xb0 [hsr] [21071.937774][T22666] print_address_description.constprop.5+0x1be/0x360 [21071.939019][T22666] ? hsr_debugfs_rename+0xaa/0xb0 [hsr] [21071.940081][T22666] ? hsr_debugfs_rename+0xaa/0xb0 [hsr] [21071.940949][T22666] __kasan_report+0x12a/0x16f [21071.941758][T22666] ? hsr_debugfs_rename+0xaa/0xb0 [hsr] [21071.942674][T22666] kasan_report+0xe/0x20 [21071.943325][T22666] hsr_debugfs_rename+0xaa/0xb0 [hsr] [21071.944187][T22666] hsr_netdev_notify+0x1fe/0x9b0 [hsr] [21071.945052][T22666] ? __module_text_address+0x13/0x140 [21071.945897][T22666] notifier_call_chain+0x90/0x160 [21071.946743][T22666] dev_change_name+0x419/0x840 [21071.947496][T22666] ? __read_once_size_nocheck.constprop.6+0x10/0x10 [21071.948600][T22666] ? netdev_adjacent_rename_links+0x280/0x280 [21071.949577][T22666] ? __read_once_size_nocheck.constprop.6+0x10/0x10 [21071.950672][T22666] ? lock_downgrade+0x6e0/0x6e0 [21071.951345][T22666] ? do_setlink+0x811/0x2ef0 [21071.951991][T22666] do_setlink+0x811/0x2ef0 [21071.952613][T22666] ? is_bpf_text_address+0x81/0xe0 [ ... ] Reported-by: syzbot+9328206518f08318a5fd@syzkaller.appspotmail.com Fixes: 4c2d5e33dcd3 ("hsr: rename debugfs file when interface name is changed") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index d2ee7125a7f1..9e389accbfc7 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -46,7 +46,8 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, hsr_check_carrier_and_operstate(hsr); break; case NETDEV_CHANGENAME: - hsr_debugfs_rename(dev); + if (is_hsr_master(dev)) + hsr_debugfs_rename(dev); break; case NETDEV_CHANGEADDR: if (port->type == HSR_PT_MASTER) { From 969e11529221a6a2a787cb3b63ccf9402f8d2e37 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Mon, 23 Dec 2019 16:13:48 +0800 Subject: [PATCH 103/146] drm/amdgpu: correct RLC firmwares loading sequence Per confirmation with RLC firmware team, the RLC should be unhalted after all RLC related firmwares uploaded. However, in fact the RLC is unhalted immediately after RLCG firmware uploaded. And that may causes unexpected PSP hang on loading the succeeding RLC save restore list related firmwares. So, we correct the firmware loading sequence to load RLC save restore list related firmwares before RLCG ucode. That will help to get around this issue. Signed-off-by: Evan Quan Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 44be3a45b25e..e1b8d8daeafc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1488,7 +1488,7 @@ static int psp_np_fw_load(struct psp_context *psp) /* Start rlc autoload after psp recieved all the gfx firmware */ if (psp->autoload_supported && ucode->ucode_id == (amdgpu_sriov_vf(adev) ? - AMDGPU_UCODE_ID_CP_MEC2 : AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM)) { + AMDGPU_UCODE_ID_CP_MEC2 : AMDGPU_UCODE_ID_RLC_G)) { ret = psp_rlc_autoload(psp); if (ret) { DRM_ERROR("Failed to start rlc autoload\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h index 410587b950f3..914acecda5cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h @@ -292,10 +292,10 @@ enum AMDGPU_UCODE_ID { AMDGPU_UCODE_ID_CP_MEC2_JT, AMDGPU_UCODE_ID_CP_MES, AMDGPU_UCODE_ID_CP_MES_DATA, - AMDGPU_UCODE_ID_RLC_G, AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL, AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM, AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM, + AMDGPU_UCODE_ID_RLC_G, AMDGPU_UCODE_ID_STORAGE, AMDGPU_UCODE_ID_SMC, AMDGPU_UCODE_ID_UVD, From 9c95a278ba7ca3ccc111c165cc74cb23c744fc85 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 11 Dec 2019 12:44:08 +0100 Subject: [PATCH 104/146] apparmor: fix bind mounts aborting with -ENOMEM With commit df323337e507 ("apparmor: Use a memory pool instead per-CPU caches, 2019-05-03"), AppArmor code was converted to use memory pools. In that conversion, a bug snuck into the code that polices bind mounts that causes all bind mounts to fail with -ENOMEM, as we erroneously error out if `aa_get_buffer` returns a pointer instead of erroring out when it does _not_ return a valid pointer. Fix the issue by correctly checking for valid pointers returned by `aa_get_buffer` to fix bind mounts with AppArmor. Fixes: df323337e507 ("apparmor: Use a memory pool instead per-CPU caches") Signed-off-by: Patrick Steinhardt Signed-off-by: John Johansen --- security/apparmor/mount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c index 4ed6688f9d40..e0828ee7a345 100644 --- a/security/apparmor/mount.c +++ b/security/apparmor/mount.c @@ -442,7 +442,7 @@ int aa_bind_mount(struct aa_label *label, const struct path *path, buffer = aa_get_buffer(false); old_buffer = aa_get_buffer(false); error = -ENOMEM; - if (!buffer || old_buffer) + if (!buffer || !old_buffer) goto out; error = fn_for_each_confined(label, profile, From 20d4e80d255dd7cfecb53743bc550ebcad04549d Mon Sep 17 00:00:00 2001 From: John Johansen Date: Wed, 18 Dec 2019 11:04:07 -0800 Subject: [PATCH 105/146] apparmor: only get a label reference if the fast path check fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The common fast path check can be done under rcu_read_lock() and doesn't need a reference count on the label. Only take a reference count if entering the slow path. Fixes reported hackbench regression - sha1 79e178a57dae ("Merge tag 'apparmor-pr-2019-12-03' of git://git.kernel.org/pub/scm/linux/kernel/git/jj/linux-apparmor") hackbench -l (256000/#grp) -g #grp 128 groups 19.679 ±0.90% - previous sha1 01d1dff64662 ("Merge tag 's390-5.5-2' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux") hackbench -l (256000/#grp) -g #grp 128 groups 3.1689 ±3.04% Reported-by: Vincent Guittot Tested-by: Vincent Guittot Tested-by: Sebastian Andrzej Siewior Fixes: bce4e7e9c45e ("apparmor: reduce rcu_read_lock scope for aa_file_perm mediation") Signed-off-by: John Johansen --- security/apparmor/file.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/security/apparmor/file.c b/security/apparmor/file.c index fe2ebe5e865e..f1caf3674e1c 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -618,8 +618,7 @@ int aa_file_perm(const char *op, struct aa_label *label, struct file *file, fctx = file_ctx(file); rcu_read_lock(); - flabel = aa_get_newest_label(rcu_dereference(fctx->label)); - rcu_read_unlock(); + flabel = rcu_dereference(fctx->label); AA_BUG(!flabel); /* revalidate access, if task is unconfined, or the cached cred @@ -631,9 +630,13 @@ int aa_file_perm(const char *op, struct aa_label *label, struct file *file, */ denied = request & ~fctx->allow; if (unconfined(label) || unconfined(flabel) || - (!denied && aa_label_is_subset(flabel, label))) + (!denied && aa_label_is_subset(flabel, label))) { + rcu_read_unlock(); goto done; + } + flabel = aa_get_newest_label(flabel); + rcu_read_unlock(); /* TODO: label cross check */ if (file->f_path.mnt && path_mediated_fs(file->f_path.dentry)) @@ -643,8 +646,9 @@ int aa_file_perm(const char *op, struct aa_label *label, struct file *file, else if (S_ISSOCK(file_inode(file)->i_mode)) error = __file_sock_perm(op, label, flabel, file, request, denied); -done: aa_put_label(flabel); + +done: return error; } From 8df955a32a73315055e0cd187cbb1cea5820394b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 30 Dec 2019 11:48:10 -0800 Subject: [PATCH 106/146] pstore/ram: Fix error-path memory leak in persistent_ram_new() callers For callers that allocated a label for persistent_ram_new(), if the call fails, they must clean up the allocation. Suggested-by: Navid Emamdoost Fixes: 1227daa43bce ("pstore/ram: Clarify resource reservation labels") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/lkml/20191211191353.14385-1-navid.emamdoost@gmail.com Signed-off-by: Kees Cook --- fs/pstore/ram.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 8caff834f002..f753f3b6f88d 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -577,6 +577,7 @@ static int ramoops_init_przs(const char *name, dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n", name, record_size, (unsigned long long)*paddr, err); + kfree(label); while (i > 0) { i--; @@ -622,6 +623,7 @@ static int ramoops_init_prz(const char *name, dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n", name, sz, (unsigned long long)*paddr, err); + kfree(label); return err; } From 9e5f1c19800b808a37fb9815a26d382132c26c3d Mon Sep 17 00:00:00 2001 From: Aleksandr Yashkin Date: Mon, 23 Dec 2019 18:38:16 +0500 Subject: [PATCH 107/146] pstore/ram: Write new dumps to start of recycled zones The ram_core.c routines treat przs as circular buffers. When writing a new crash dump, the old buffer needs to be cleared so that the new dump doesn't end up in the wrong place (i.e. at the end). The solution to this problem is to reset the circular buffer state before writing a new Oops dump. Signed-off-by: Aleksandr Yashkin Signed-off-by: Nikolay Merinov Signed-off-by: Ariel Gilman Link: https://lore.kernel.org/r/20191223133816.28155-1-n.merinov@inango-systems.com Fixes: 896fc1f0c4c6 ("pstore/ram: Switch to persistent_ram routines") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- fs/pstore/ram.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index f753f3b6f88d..487ee39b438a 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -407,6 +407,17 @@ static int notrace ramoops_pstore_write(struct pstore_record *record) prz = cxt->dprzs[cxt->dump_write_cnt]; + /* + * Since this is a new crash dump, we need to reset the buffer in + * case it still has an old dump present. Without this, the new dump + * will get appended, which would seriously confuse anything trying + * to check dump file contents. Specifically, ramoops_read_kmsg_hdr() + * expects to find a dump header in the beginning of buffer data, so + * we must to reset the buffer values, in order to ensure that the + * header will be written to the beginning of the buffer. + */ + persistent_ram_zap(prz); + /* Build header and append record contents. */ hlen = ramoops_write_kmsg_hdr(prz, record); if (!hlen) From 771b894f2f3dfedc2ba5561731fffa0e39b1bbb6 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Mon, 30 Dec 2019 12:35:03 -0800 Subject: [PATCH 108/146] samples/seccomp: Zero out members based on seccomp_notif_sizes The sizes by which seccomp_notif and seccomp_notif_resp are allocated are based on the SECCOMP_GET_NOTIF_SIZES ioctl. This allows for graceful extension of these datastructures. If userspace zeroes out the datastructure based on its version, and it is lagging behind the kernel's version, it will end up sending trailing garbage. On the other hand, if it is ahead of the kernel version, it will write extra zero space, and potentially cause corruption. Signed-off-by: Sargun Dhillon Suggested-by: Tycho Andersen Link: https://lore.kernel.org/r/20191230203503.4925-1-sargun@sargun.me Fixes: fec7b6690541 ("samples: add an example of seccomp user trap") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- samples/seccomp/user-trap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c index 6d0125ca8af7..20291ec6489f 100644 --- a/samples/seccomp/user-trap.c +++ b/samples/seccomp/user-trap.c @@ -298,14 +298,14 @@ int main(void) req = malloc(sizes.seccomp_notif); if (!req) goto out_close; - memset(req, 0, sizeof(*req)); resp = malloc(sizes.seccomp_notif_resp); if (!resp) goto out_req; - memset(resp, 0, sizeof(*resp)); + memset(resp, 0, sizes.seccomp_notif_resp); while (1) { + memset(req, 0, sizes.seccomp_notif); if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) { perror("ioctl recv"); goto out_resp; From 88c13f8bd71472fbab5338b01d99122908c77e53 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Sat, 28 Dec 2019 22:24:49 -0800 Subject: [PATCH 109/146] selftests/seccomp: Zero out seccomp_notif The seccomp_notif structure should be zeroed out prior to calling the SECCOMP_IOCTL_NOTIF_RECV ioctl. Previously, the kernel did not check whether these structures were zeroed out or not, so these worked. This patch zeroes out the seccomp_notif data structure prior to calling the ioctl. Signed-off-by: Sargun Dhillon Reviewed-by: Tycho Andersen Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20191229062451.9467-1-sargun@sargun.me Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 6944b898bb53..f53f14971bff 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -3278,6 +3278,7 @@ TEST(user_notification_signal) close(sk_pair[1]); + memset(&req, 0, sizeof(req)); EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); EXPECT_EQ(kill(pid, SIGUSR1), 0); @@ -3296,6 +3297,7 @@ TEST(user_notification_signal) EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1); EXPECT_EQ(errno, ENOENT); + memset(&req, 0, sizeof(req)); EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); resp.id = req.id; From 2882d53c9c6f3b8311d225062522f03772cf0179 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Sat, 28 Dec 2019 22:24:50 -0800 Subject: [PATCH 110/146] seccomp: Check that seccomp_notif is zeroed out by the user This patch is a small change in enforcement of the uapi for SECCOMP_IOCTL_NOTIF_RECV ioctl. Specifically, the datastructure which is passed (seccomp_notif) must be zeroed out. Previously any of its members could be set to nonsense values, and we would ignore it. This ensures all fields are set to their zero value. Signed-off-by: Sargun Dhillon Reviewed-by: Christian Brauner Reviewed-by: Aleksa Sarai Acked-by: Tycho Andersen Link: https://lore.kernel.org/r/20191229062451.9467-2-sargun@sargun.me Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- kernel/seccomp.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 12d2227e5786..b6ea3dcb57bf 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1026,6 +1026,13 @@ static long seccomp_notify_recv(struct seccomp_filter *filter, struct seccomp_notif unotif; ssize_t ret; + /* Verify that we're not given garbage to keep struct extensible. */ + ret = check_zeroed_user(buf, sizeof(unotif)); + if (ret < 0) + return ret; + if (!ret) + return -EINVAL; + memset(&unotif, 0, sizeof(unotif)); ret = down_interruptible(&filter->notif->request); From e4ab5ccc357b978999328fadae164e098c26fa40 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Mon, 30 Dec 2019 12:38:11 -0800 Subject: [PATCH 111/146] selftests/seccomp: Catch garbage on SECCOMP_IOCTL_NOTIF_RECV This adds logic to the user_notification_basic test to set a member of struct seccomp_notif to an invalid value to ensure that the kernel returns EINVAL if any of the struct seccomp_notif members are set to invalid values. Signed-off-by: Sargun Dhillon Suggested-by: Christian Brauner Link: https://lore.kernel.org/r/20191230203811.4996-1-sargun@sargun.me Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index f53f14971bff..ee1b727ede04 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -3158,7 +3158,18 @@ TEST(user_notification_basic) EXPECT_GT(poll(&pollfd, 1, -1), 0); EXPECT_EQ(pollfd.revents, POLLIN); - EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + /* Test that we can't pass garbage to the kernel. */ + memset(&req, 0, sizeof(req)); + req.pid = -1; + errno = 0; + ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + if (ret) { + req.pid = 0; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + } pollfd.fd = listener; pollfd.events = POLLIN | POLLOUT; From a5b0dc5a46c221725c43bd9b01570239a4cd78b1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Dec 2019 14:39:28 +0100 Subject: [PATCH 112/146] gcc-plugins: make it possible to disable CONFIG_GCC_PLUGINS again I noticed that randconfig builds with gcc no longer produce a lot of ccache hits, unlike with clang, and traced this back to plugins now being enabled unconditionally if they are supported. I am now working around this by adding export CCACHE_COMPILERCHECK=/usr/bin/size -A %compiler% to my top-level Makefile. This changes the heuristic that ccache uses to determine whether the plugins are the same after a 'make clean'. However, it also seems that being able to just turn off the plugins is generally useful, at least for build testing it adds noticeable overhead but does not find a lot of bugs additional bugs, and may be easier for ccache users than my workaround. Fixes: 9f671e58159a ("security: Create "kernel hardening" config area") Signed-off-by: Arnd Bergmann Acked-by: Ard Biesheuvel Reviewed-by: Masahiro Yamada Link: https://lore.kernel.org/r/20191211133951.401933-1-arnd@arndb.de Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- scripts/gcc-plugins/Kconfig | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig index d33de0b9f4f5..e3569543bdac 100644 --- a/scripts/gcc-plugins/Kconfig +++ b/scripts/gcc-plugins/Kconfig @@ -14,8 +14,8 @@ config HAVE_GCC_PLUGINS An arch should select this symbol if it supports building with GCC plugins. -config GCC_PLUGINS - bool +menuconfig GCC_PLUGINS + bool "GCC plugins" depends on HAVE_GCC_PLUGINS depends on PLUGIN_HOSTCC != "" default y @@ -25,8 +25,7 @@ config GCC_PLUGINS See Documentation/core-api/gcc-plugins.rst for details. -menu "GCC plugins" - depends on GCC_PLUGINS +if GCC_PLUGINS config GCC_PLUGIN_CYC_COMPLEXITY bool "Compute the cyclomatic complexity of a function" if EXPERT @@ -113,4 +112,4 @@ config GCC_PLUGIN_ARM_SSP_PER_TASK bool depends on GCC_PLUGINS && ARM -endmenu +endif From 74f1a299107b9e1a563831a4ba85f769ab577164 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Wed, 1 Jan 2020 20:05:03 +0100 Subject: [PATCH 113/146] Revert "fs: remove ksys_dup()" This reverts commit 8243186f0cc7 ("fs: remove ksys_dup()") and the subsequent fix for it in commit 2d3145f8d280 ("early init: fix error handling when opening /dev/console"). Trying to use filp_open() and f_dupfd() instead of pseudo-syscalls caused more trouble than what is worth it: it requires accessing vfs internals and it turns out there were other bugs in it too. In particular, the file reference counting was wrong - because unlike the original "open+2*dup" sequence it used "filp_open+3*f_dupfd" and thus had an extra leaked file reference. That in turn then caused odd problems with Androidx86 long after boot becaue of how the extra reference to the console kept the session active even after all file descriptors had been closed. Reported-by: youling 257 Cc: Arvind Sankar Cc: Al Viro Signed-off-by: Dominik Brodowski Signed-off-by: Linus Torvalds --- fs/file.c | 7 ++++++- include/linux/syscalls.h | 1 + init/main.c | 26 ++++++-------------------- 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/fs/file.c b/fs/file.c index 2f4fcf985079..3da91a112bab 100644 --- a/fs/file.c +++ b/fs/file.c @@ -960,7 +960,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) return ksys_dup3(oldfd, newfd, 0); } -SYSCALL_DEFINE1(dup, unsigned int, fildes) +int ksys_dup(unsigned int fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); @@ -975,6 +975,11 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes) return ret; } +SYSCALL_DEFINE1(dup, unsigned int, fildes) +{ + return ksys_dup(fildes); +} + int f_dupfd(unsigned int from, struct file *file, unsigned flags) { int err; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 2960dedcfde8..5262b7a76d39 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1232,6 +1232,7 @@ asmlinkage long sys_ni_syscall(void); */ int ksys_umount(char __user *name, int flags); +int ksys_dup(unsigned int fildes); int ksys_chroot(const char __user *filename); ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count); int ksys_chdir(const char __user *filename); diff --git a/init/main.c b/init/main.c index 1ecfd43ed464..2cd736059416 100644 --- a/init/main.c +++ b/init/main.c @@ -93,7 +93,6 @@ #include #include #include -#include #include #include @@ -1158,26 +1157,13 @@ static int __ref kernel_init(void *unused) void console_on_rootfs(void) { - struct file *file; - unsigned int i; + /* Open the /dev/console as stdin, this should never fail */ + if (ksys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) + pr_err("Warning: unable to open an initial console.\n"); - /* Open /dev/console in kernelspace, this should never fail */ - file = filp_open("/dev/console", O_RDWR, 0); - if (IS_ERR(file)) - goto err_out; - - /* create stdin/stdout/stderr, this should never fail */ - for (i = 0; i < 3; i++) { - if (f_dupfd(i, file, 0) != i) - goto err_out; - } - - return; - -err_out: - /* no panic -- this might not be fatal */ - pr_err("Warning: unable to open an initial console.\n"); - return; + /* create stdout/stderr */ + (void) ksys_dup(0); + (void) ksys_dup(0); } static noinline void __init kernel_init_freeable(void) From bbcc5672b0063b0e9d65dc8787a4f09c3b5bb5cc Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Wed, 1 Jan 2020 20:50:38 -0800 Subject: [PATCH 114/146] MIPS: Avoid VDSO ABI breakage due to global register variable Declaring __current_thread_info as a global register variable has the effect of preventing GCC from saving & restoring its value in cases where the ABI would typically do so. To quote GCC documentation: > If the register is a call-saved register, call ABI is affected: the > register will not be restored in function epilogue sequences after the > variable has been assigned. Therefore, functions cannot safely return > to callers that assume standard ABI. When our position independent VDSO is built for the n32 or n64 ABIs all functions it exposes should be preserving the value of $gp/$28 for their caller, but in the presence of the __current_thread_info global register variable GCC stops doing so & simply clobbers $gp/$28 when calculating the address of the GOT. In cases where the VDSO returns success this problem will typically be masked by the caller in libc returning & restoring $gp/$28 itself, but that is by no means guaranteed. In cases where the VDSO returns an error libc will typically contain a fallback path which will now fail (typically with a bad memory access) if it attempts anything which relies upon the value of $gp/$28 - eg. accessing anything via the GOT. One fix for this would be to move the declaration of __current_thread_info inside the current_thread_info() function, demoting it from global register variable to local register variable & avoiding inadvertently creating a non-standard calling ABI for the VDSO. Unfortunately this causes issues for clang, which doesn't support local register variables as pointed out by commit fe92da0f355e ("MIPS: Changed current_thread_info() to an equivalent supported by both clang and GCC") which introduced the global register variable before we had a VDSO to worry about. Instead, fix this by continuing to use the global register variable for the kernel proper but declare __current_thread_info as a simple extern variable when building the VDSO. It should never be referenced, and will cause a link error if it is. This resolves the calling convention issue for the VDSO without having any impact upon the build of the kernel itself for either clang or gcc. Signed-off-by: Paul Burton Fixes: ebb5e78cc634 ("MIPS: Initial implementation of a VDSO") Reported-by: Jason A. Donenfeld Reviewed-by: Jason A. Donenfeld Tested-by: Jason A. Donenfeld Cc: Arnd Bergmann Cc: Christian Brauner Cc: Vincenzo Frascino Cc: # v4.4+ Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/include/asm/thread_info.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index 4993db40482c..ee26f9a4575d 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -49,8 +49,26 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -/* How to get the thread information struct from C. */ +/* + * A pointer to the struct thread_info for the currently executing thread is + * held in register $28/$gp. + * + * We declare __current_thread_info as a global register variable rather than a + * local register variable within current_thread_info() because clang doesn't + * support explicit local register variables. + * + * When building the VDSO we take care not to declare the global register + * variable because this causes GCC to not preserve the value of $28/$gp in + * functions that change its value (which is common in the PIC VDSO when + * accessing the GOT). Since the VDSO shouldn't be accessing + * __current_thread_info anyway we declare it extern in order to cause a link + * failure if it's referenced. + */ +#ifdef __VDSO__ +extern struct thread_info *__current_thread_info; +#else register struct thread_info *__current_thread_info __asm__("$28"); +#endif static inline struct thread_info *current_thread_info(void) { From 2fec966f593efd076ad1e56d274611cea7d29eec Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Tue, 17 Dec 2019 20:21:37 +0800 Subject: [PATCH 115/146] agp: remove unused variable mcapndx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fix the following warning: drivers/char/agp/isoch.c: In function ‘agp_3_5_isochronous_node_enable’: drivers/char/agp/isoch.c:87:5: warning: variable ‘mcapndx’ set but not used [-Wunused-but-set-variable] u8 mcapndx; ^~~~~~~ Signed-off-by: Yunfeng Ye Signed-off-by: Dave Airlie --- drivers/char/agp/isoch.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/char/agp/isoch.c b/drivers/char/agp/isoch.c index 31c374b1b91b..324992439ee8 100644 --- a/drivers/char/agp/isoch.c +++ b/drivers/char/agp/isoch.c @@ -84,7 +84,6 @@ static int agp_3_5_isochronous_node_enable(struct agp_bridge_data *bridge, unsigned int cdev = 0; u32 mnistat, tnistat, tstatus, mcmd; u16 tnicmd, mnicmd; - u8 mcapndx; u32 tot_bw = 0, tot_n = 0, tot_rq = 0, y_max, rq_isoch, rq_async; u32 step, rem, rem_isoch, rem_async; int ret = 0; @@ -138,8 +137,6 @@ static int agp_3_5_isochronous_node_enable(struct agp_bridge_data *bridge, cur = list_entry(pos, struct agp_3_5_dev, list); dev = cur->dev; - mcapndx = cur->capndx; - pci_read_config_dword(dev, cur->capndx+AGPNISTAT, &mnistat); master[cdev].maxbw = (mnistat >> 16) & 0xff; @@ -251,8 +248,6 @@ static int agp_3_5_isochronous_node_enable(struct agp_bridge_data *bridge, cur = master[cdev].dev; dev = cur->dev; - mcapndx = cur->capndx; - master[cdev].rq += (cdev == ndevs - 1) ? (rem_async + rem_isoch) : step; From a6204fc7b83cbe3398f61cf1742b09f66f0ae220 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Tue, 17 Dec 2019 20:22:57 +0800 Subject: [PATCH 116/146] agp: remove unused variable arqsz in agp_3_5_enable() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fix the following warning: drivers/char/agp/isoch.c: In function ‘agp_3_5_enable’: drivers/char/agp/isoch.c:322:13: warning: variable ‘arqsz’ set but not used [-Wunused-but-set-variable] u32 isoch, arqsz; ^~~~~ Signed-off-by: Yunfeng Ye Signed-off-by: Dave Airlie --- drivers/char/agp/isoch.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/char/agp/isoch.c b/drivers/char/agp/isoch.c index 324992439ee8..7ecf20a6d19c 100644 --- a/drivers/char/agp/isoch.c +++ b/drivers/char/agp/isoch.c @@ -314,7 +314,7 @@ int agp_3_5_enable(struct agp_bridge_data *bridge) { struct pci_dev *td = bridge->dev, *dev = NULL; u8 mcapndx; - u32 isoch, arqsz; + u32 isoch; u32 tstatus, mstatus, ncapid; u32 mmajor; u16 mpstat; @@ -329,8 +329,6 @@ int agp_3_5_enable(struct agp_bridge_data *bridge) if (isoch == 0) /* isoch xfers not available, bail out. */ return -ENODEV; - arqsz = (tstatus >> 13) & 0x7; - /* * Allocate a head for our AGP 3.5 device list * (multiple AGP v3 devices are allowed behind a single bridge). From ac51e005fe1456a288929a41d71adc6224e912d2 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Thu, 2 Jan 2020 11:12:40 +0800 Subject: [PATCH 117/146] riscv: mm: use __pa_symbol for kernel symbols __pa_symbol is the marcro that should be used for kernel symbols. It is also a pre-requisite for DEBUG_VIRTUAL which will do bounds checking. Signed-off-by: Zong Li Reviewed-by: Anup Patel Signed-off-by: Paul Walmsley --- arch/riscv/mm/init.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 69f6678db7f3..965a8cf4829c 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -99,13 +99,13 @@ static void __init setup_initrd(void) pr_info("initrd not found or empty"); goto disable; } - if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) { + if (__pa_symbol(initrd_end) > PFN_PHYS(max_low_pfn)) { pr_err("initrd extends beyond end of memory"); goto disable; } size = initrd_end - initrd_start; - memblock_reserve(__pa(initrd_start), size); + memblock_reserve(__pa_symbol(initrd_start), size); initrd_below_start_ok = 1; pr_info("Initial ramdisk at: 0x%p (%lu bytes)\n", @@ -124,8 +124,8 @@ void __init setup_bootmem(void) { struct memblock_region *reg; phys_addr_t mem_size = 0; - phys_addr_t vmlinux_end = __pa(&_end); - phys_addr_t vmlinux_start = __pa(&_start); + phys_addr_t vmlinux_end = __pa_symbol(&_end); + phys_addr_t vmlinux_start = __pa_symbol(&_start); /* Find the memory region containing the kernel */ for_each_memblock(memory, reg) { @@ -445,7 +445,7 @@ static void __init setup_vm_final(void) /* Setup swapper PGD for fixmap */ create_pgd_mapping(swapper_pg_dir, FIXADDR_START, - __pa(fixmap_pgd_next), + __pa_symbol(fixmap_pgd_next), PGDIR_SIZE, PAGE_TABLE); /* Map all memory banks */ @@ -474,7 +474,7 @@ static void __init setup_vm_final(void) clear_fixmap(FIX_PMD); /* Move to swapper page table */ - csr_write(CSR_SATP, PFN_DOWN(__pa(swapper_pg_dir)) | SATP_MODE); + csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE); local_flush_tlb_all(); } #else From 0da310e82d3a9bff6ef6b0f2fbf45d1a05cc64fe Mon Sep 17 00:00:00 2001 From: Zong Li Date: Thu, 2 Jan 2020 11:09:54 +0800 Subject: [PATCH 118/146] riscv: gcov: enable gcov for RISC-V This patch enables GCOV code coverage measurement on RISC-V. Lightly tested on QEMU and Hifive Unleashed board, seems to work as expected. Signed-off-by: Zong Li Reviewed-by: Anup Patel Acked-by: Jonathan Corbet Signed-off-by: Paul Walmsley --- Documentation/features/debug/gcov-profile-all/arch-support.txt | 2 +- arch/riscv/Kconfig | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/features/debug/gcov-profile-all/arch-support.txt b/Documentation/features/debug/gcov-profile-all/arch-support.txt index 059d58a549c7..6fb2b0671994 100644 --- a/Documentation/features/debug/gcov-profile-all/arch-support.txt +++ b/Documentation/features/debug/gcov-profile-all/arch-support.txt @@ -23,7 +23,7 @@ | openrisc: | TODO | | parisc: | TODO | | powerpc: | ok | - | riscv: | TODO | + | riscv: | ok | | s390: | ok | | sh: | ok | | sparc: | TODO | diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index d8efbaa78d67..a31169b02ec0 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -64,6 +64,7 @@ config RISCV select SPARSEMEM_STATIC if 32BIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select HAVE_ARCH_MMAP_RND_BITS if MMU + select ARCH_HAS_GCOV_PROFILE_ALL config ARCH_MMAP_RND_BITS_MIN default 18 if 64BIT From cfda8617e22a8bf217a613d0b3ba3a38778443ba Mon Sep 17 00:00:00 2001 From: Yash Shah Date: Fri, 3 Jan 2020 09:43:20 +0530 Subject: [PATCH 119/146] riscv: dts: Add DT support for SiFive L2 cache controller Add the L2 cache controller DT node in SiFive FU540 soc-specific DT file Signed-off-by: Yash Shah Reviewed-by: Palmer Dabbelt Signed-off-by: Paul Walmsley --- arch/riscv/boot/dts/sifive/fu540-c000.dtsi | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi index 70a1891e7cd0..a2e3d54e830c 100644 --- a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi +++ b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi @@ -54,6 +54,7 @@ cpu1: cpu@1 { reg = <1>; riscv,isa = "rv64imafdc"; tlb-split; + next-level-cache = <&l2cache>; cpu1_intc: interrupt-controller { #interrupt-cells = <1>; compatible = "riscv,cpu-intc"; @@ -77,6 +78,7 @@ cpu2: cpu@2 { reg = <2>; riscv,isa = "rv64imafdc"; tlb-split; + next-level-cache = <&l2cache>; cpu2_intc: interrupt-controller { #interrupt-cells = <1>; compatible = "riscv,cpu-intc"; @@ -100,6 +102,7 @@ cpu3: cpu@3 { reg = <3>; riscv,isa = "rv64imafdc"; tlb-split; + next-level-cache = <&l2cache>; cpu3_intc: interrupt-controller { #interrupt-cells = <1>; compatible = "riscv,cpu-intc"; @@ -123,6 +126,7 @@ cpu4: cpu@4 { reg = <4>; riscv,isa = "rv64imafdc"; tlb-split; + next-level-cache = <&l2cache>; cpu4_intc: interrupt-controller { #interrupt-cells = <1>; compatible = "riscv,cpu-intc"; @@ -253,6 +257,17 @@ pwm1: pwm@10021000 { #pwm-cells = <3>; status = "disabled"; }; + l2cache: cache-controller@2010000 { + compatible = "sifive,fu540-c000-ccache", "cache"; + cache-block-size = <64>; + cache-level = <2>; + cache-sets = <1024>; + cache-size = <2097152>; + cache-unified; + interrupt-parent = <&plic0>; + interrupts = <1 2 3>; + reg = <0x0 0x2010000 0x0 0x1000>; + }; }; }; From 1d8f65798240b6577d8c44d20c8ea8f1d429e495 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Mon, 23 Dec 2019 16:46:13 +0800 Subject: [PATCH 120/146] riscv: ftrace: correct the condition logic in function graph tracer The condition should be logical NOT to assign the hook address to parent address. Because the return value 0 of function_graph_enter upon success. Fixes: e949b6db51dc (riscv/function_graph: Simplify with function_graph_enter()) Signed-off-by: Zong Li Reviewed-by: Steven Rostedt (VMware) Cc: stable@vger.kernel.org Signed-off-by: Paul Walmsley --- arch/riscv/kernel/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index b94d8db5ddcc..c40fdcdeb950 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -142,7 +142,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, */ old = *parent; - if (function_graph_enter(old, self_addr, frame_pointer, parent)) + if (!function_graph_enter(old, self_addr, frame_pointer, parent)) *parent = return_hooker; } From ce644cf3fa06504c2c71ab1b794160d54aaccbc0 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 6 Nov 2019 12:57:07 +0100 Subject: [PATCH 121/146] media: intel-ipu3: Align struct ipu3_uapi_awb_fr_config_s to 32 bytes A struct that needs to be aligned to 32 bytes has a size of 28. Increase the size to 32. This makes elements of arrays of this struct aligned to 32 as well, and other structs where members are aligned to 32 mixing ipu3_uapi_awb_fr_config_s as well as other types. Fixes: commit dca5ef2aa1e6 ("media: staging/intel-ipu3: remove the unnecessary compiler flags") Signed-off-by: Sakari Ailus Tested-by: Bingbu Cao Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/ipu3/include/intel-ipu3.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/media/ipu3/include/intel-ipu3.h b/drivers/staging/media/ipu3/include/intel-ipu3.h index 08eaa0bad0de..1c9c3ba4d518 100644 --- a/drivers/staging/media/ipu3/include/intel-ipu3.h +++ b/drivers/staging/media/ipu3/include/intel-ipu3.h @@ -449,7 +449,7 @@ struct ipu3_uapi_awb_fr_config_s { __u16 reserved1; __u32 bayer_sign; __u8 bayer_nf; - __u8 reserved2[3]; + __u8 reserved2[7]; } __attribute__((aligned(32))) __packed; /** From cc976614f59bd8e45de8ce988a6bcb5de711d994 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 30 Dec 2019 22:20:06 +0900 Subject: [PATCH 122/146] gen_initramfs_list.sh: fix 'bad variable name' error Prior to commit 858805b336be ("kbuild: add $(BASH) to run scripts with bash-extension"), this shell script was almost always run by bash since bash is usually installed on the system by default. Now, this script is run by sh, which might be a symlink to dash. On such distributions, the following code emits an error: local dev=`LC_ALL=C ls -l "${location}"` You can reproduce the build error, for example by setting CONFIG_INITRAMFS_SOURCE="/dev". GEN usr/initramfs_data.cpio.gz ./usr/gen_initramfs_list.sh: 131: local: 1: bad variable name make[1]: *** [usr/Makefile:61: usr/initramfs_data.cpio.gz] Error 2 This is because `LC_ALL=C ls -l "${location}"` contains spaces. Surrounding it with double-quotes fixes the error. Fixes: 858805b336be ("kbuild: add $(BASH) to run scripts with bash-extension") Reported-by: Jory A. Pratt Signed-off-by: Masahiro Yamada --- usr/gen_initramfs_list.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usr/gen_initramfs_list.sh b/usr/gen_initramfs_list.sh index 0aad760fcd8c..2bbac73e6477 100755 --- a/usr/gen_initramfs_list.sh +++ b/usr/gen_initramfs_list.sh @@ -128,7 +128,7 @@ parse() { str="${ftype} ${name} ${location} ${str}" ;; "nod") - local dev=`LC_ALL=C ls -l "${location}"` + local dev="`LC_ALL=C ls -l "${location}"`" local maj=`field 5 ${dev}` local min=`field 6 ${dev}` maj=${maj%,} From 8ffdc54b6f4cd718a45802e645bb853e3a46a078 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Dec 2019 15:07:47 +0100 Subject: [PATCH 123/146] kbuild/deb-pkg: annotate libelf-dev dependency as :native Cross compiling the x86 kernel on a non-x86 build machine produces the following error when CONFIG_UNWINDER_ORC is enabled, regardless of whether libelf-dev is installed or not. dpkg-checkbuilddeps: error: Unmet build dependencies: libelf-dev dpkg-buildpackage: warning: build dependencies/conflicts unsatisfied; aborting dpkg-buildpackage: warning: (Use -d flag to override.) Since this is a build time dependency for a build tool, we need to depend on the native version of libelf-dev so add the appropriate annotation. Signed-off-by: Ard Biesheuvel Signed-off-by: Masahiro Yamada --- scripts/package/mkdebian | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index 7c230016b08d..357dc56bcf30 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -136,7 +136,7 @@ mkdir -p debian/source/ echo "1.0" > debian/source/format echo $debarch > debian/arch -extra_build_depends=", $(if_enabled_echo CONFIG_UNWINDER_ORC libelf-dev)" +extra_build_depends=", $(if_enabled_echo CONFIG_UNWINDER_ORC libelf-dev:native)" extra_build_depends="$extra_build_depends, $(if_enabled_echo CONFIG_SYSTEM_TRUSTED_KEYRING libssl-dev:native)" # Generate a simple changelog template From 15f0ec941f4f908fefa23a30ded8358977cc1cc0 Mon Sep 17 00:00:00 2001 From: Jan Stancek Date: Fri, 3 Jan 2020 18:37:18 +0100 Subject: [PATCH 124/146] mm/hugetlbfs: fix for_each_hstate() loop in init_hugetlbfs_fs() LTP memfd_create04 started failing for some huge page sizes after v5.4-10135-gc3bfc5dd73c6. The problem is the check introduced to for_each_hstate() loop that should skip default_hstate_idx. Since it doesn't update 'i' counter, all subsequent huge page sizes are skipped as well. Fixes: 8fc312b32b25 ("mm/hugetlbfs: fix error handling when setting up mounts") Signed-off-by: Jan Stancek Reviewed-by: Mike Kravetz Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d5c2a3158610..a66e425884d1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1498,8 +1498,10 @@ static int __init init_hugetlbfs_fs(void) /* other hstates are optional */ i = 0; for_each_hstate(h) { - if (i == default_hstate_idx) + if (i == default_hstate_idx) { + i++; continue; + } mnt = mount_one_hugetlbfs(h); if (IS_ERR(mnt)) From feee6b2989165631b17ac6d4ccdbf6759254e85a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Sat, 4 Jan 2020 12:59:33 -0800 Subject: [PATCH 125/146] mm/memory_hotplug: shrink zones when offlining memory We currently try to shrink a single zone when removing memory. We use the zone of the first page of the memory we are removing. If that memmap was never initialized (e.g., memory was never onlined), we will read garbage and can trigger kernel BUGs (due to a stale pointer): BUG: unable to handle page fault for address: 000000000000353d #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] SMP PTI CPU: 1 PID: 7 Comm: kworker/u8:0 Not tainted 5.3.0-rc5-next-20190820+ #317 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.4 Workqueue: kacpi_hotplug acpi_hotplug_work_fn RIP: 0010:clear_zone_contiguous+0x5/0x10 Code: 48 89 c6 48 89 c3 e8 2a fe ff ff 48 85 c0 75 cf 5b 5d c3 c6 85 fd 05 00 00 01 5b 5d c3 0f 1f 840 RSP: 0018:ffffad2400043c98 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000200000000 RCX: 0000000000000000 RDX: 0000000000200000 RSI: 0000000000140000 RDI: 0000000000002f40 RBP: 0000000140000000 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000140000 R13: 0000000000140000 R14: 0000000000002f40 R15: ffff9e3e7aff3680 FS: 0000000000000000(0000) GS:ffff9e3e7bb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000353d CR3: 0000000058610000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __remove_pages+0x4b/0x640 arch_remove_memory+0x63/0x8d try_remove_memory+0xdb/0x130 __remove_memory+0xa/0x11 acpi_memory_device_remove+0x70/0x100 acpi_bus_trim+0x55/0x90 acpi_device_hotplug+0x227/0x3a0 acpi_hotplug_work_fn+0x1a/0x30 process_one_work+0x221/0x550 worker_thread+0x50/0x3b0 kthread+0x105/0x140 ret_from_fork+0x3a/0x50 Modules linked in: CR2: 000000000000353d Instead, shrink the zones when offlining memory or when onlining failed. Introduce and use remove_pfn_range_from_zone(() for that. We now properly shrink the zones, even if we have DIMMs whereby - Some memory blocks fall into no zone (never onlined) - Some memory blocks fall into multiple zones (offlined+re-onlined) - Multiple memory blocks that fall into different zones Drop the zone parameter (with a potential dubious value) from __remove_pages() and __remove_section(). Link: http://lkml.kernel.org/r/20191006085646.5768-6-david@redhat.com Fixes: f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") [visible after d0dc12e86b319] Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: "Matthew Wilcox (Oracle)" Cc: "Aneesh Kumar K.V" Cc: Pavel Tatashin Cc: Greg Kroah-Hartman Cc: Dan Williams Cc: Logan Gunthorpe Cc: [5.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 4 +--- arch/ia64/mm/init.c | 4 +--- arch/powerpc/mm/mem.c | 3 +-- arch/s390/mm/init.c | 4 +--- arch/sh/mm/init.c | 4 +--- arch/x86/mm/init_32.c | 4 +--- arch/x86/mm/init_64.c | 4 +--- include/linux/memory_hotplug.h | 7 +++++-- mm/memory_hotplug.c | 31 ++++++++++++++++--------------- mm/memremap.c | 2 +- 10 files changed, 29 insertions(+), 38 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 5a3b15a14a7f..40797cbfba2d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1070,7 +1070,6 @@ void arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; /* * FIXME: Cleanup page tables (also in arch_add_memory() in case @@ -1079,7 +1078,6 @@ void arch_remove_memory(int nid, u64 start, u64 size, * unplug. ARCH_ENABLE_MEMORY_HOTREMOVE must not be * unlocked yet. */ - zone = page_zone(pfn_to_page(start_pfn)); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); } #endif diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 58fd67068bac..b01d68a2d5d9 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -689,9 +689,7 @@ void arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; - zone = page_zone(pfn_to_page(start_pfn)); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); } #endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 617c2777926f..f5535eae637f 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -151,10 +151,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap); int ret; - __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); /* Remove htab bolted mappings for this section of memory */ start = (unsigned long)__va(start); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index f0ce22220565..ac44bd76db4b 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -292,10 +292,8 @@ void arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; - zone = page_zone(pfn_to_page(start_pfn)); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); vmem_remove_mapping(start, size); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index dfdbaa50946e..d1b1ff2be17a 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -434,9 +434,7 @@ void arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; - zone = page_zone(pfn_to_page(start_pfn)); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 930edeb41ec3..0a74407ef92e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -865,10 +865,8 @@ void arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; - zone = page_zone(pfn_to_page(start_pfn)); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); } #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dcb9bc961b39..bcfede46fe02 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1212,10 +1212,8 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap); - struct zone *zone = page_zone(page); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); kernel_physical_mapping_remove(start, start + size); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3a08ecdfca11..ba0dca6aac6e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -122,8 +122,8 @@ static inline bool movable_node_is_enabled(void) extern void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap); -extern void __remove_pages(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); +extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages, + struct vmem_altmap *altmap); /* reasonably generic interface to expand the physical pages */ extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, @@ -342,6 +342,9 @@ extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); +extern void remove_pfn_range_from_zone(struct zone *zone, + unsigned long start_pfn, + unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_section(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 55ac23ef11c1..a91a072f2b2c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -483,8 +483,9 @@ static void update_pgdat_span(struct pglist_data *pgdat) pgdat->node_spanned_pages = node_end_pfn - node_start_pfn; } -static void __remove_zone(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages) +void __ref remove_pfn_range_from_zone(struct zone *zone, + unsigned long start_pfn, + unsigned long nr_pages) { struct pglist_data *pgdat = zone->zone_pgdat; unsigned long flags; @@ -499,28 +500,30 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn, return; #endif + clear_zone_contiguous(zone); + pgdat_resize_lock(zone->zone_pgdat, &flags); shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); update_pgdat_span(pgdat); pgdat_resize_unlock(zone->zone_pgdat, &flags); + + set_zone_contiguous(zone); } -static void __remove_section(struct zone *zone, unsigned long pfn, - unsigned long nr_pages, unsigned long map_offset, - struct vmem_altmap *altmap) +static void __remove_section(unsigned long pfn, unsigned long nr_pages, + unsigned long map_offset, + struct vmem_altmap *altmap) { struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn)); if (WARN_ON_ONCE(!valid_section(ms))) return; - __remove_zone(zone, pfn, nr_pages); sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap); } /** - * __remove_pages() - remove sections of pages from a zone - * @zone: zone from which pages need to be removed + * __remove_pages() - remove sections of pages * @pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) * @altmap: alternative device page map or %NULL if default memmap is used @@ -530,16 +533,14 @@ static void __remove_section(struct zone *zone, unsigned long pfn, * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). */ -void __remove_pages(struct zone *zone, unsigned long pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) +void __remove_pages(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) { unsigned long map_offset = 0; unsigned long nr, start_sec, end_sec; map_offset = vmem_altmap_offset(altmap); - clear_zone_contiguous(zone); - if (check_pfn_span(pfn, nr_pages, "remove")) return; @@ -551,13 +552,11 @@ void __remove_pages(struct zone *zone, unsigned long pfn, cond_resched(); pfns = min(nr_pages, PAGES_PER_SECTION - (pfn & ~PAGE_SECTION_MASK)); - __remove_section(zone, pfn, pfns, map_offset, altmap); + __remove_section(pfn, pfns, map_offset, altmap); pfn += pfns; nr_pages -= pfns; map_offset = 0; } - - set_zone_contiguous(zone); } int set_online_page_callback(online_page_callback_t callback) @@ -869,6 +868,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ (unsigned long long) pfn << PAGE_SHIFT, (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); + remove_pfn_range_from_zone(zone, pfn, nr_pages); mem_hotplug_done(); return ret; } @@ -1628,6 +1628,7 @@ static int __ref __offline_pages(unsigned long start_pfn, writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); + remove_pfn_range_from_zone(zone, start_pfn, nr_pages); mem_hotplug_done(); return 0; diff --git a/mm/memremap.c b/mm/memremap.c index 03ccbdfeb697..c51c6bd2fe34 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -120,7 +120,7 @@ void memunmap_pages(struct dev_pagemap *pgmap) mem_hotplug_begin(); if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - __remove_pages(page_zone(first_page), PHYS_PFN(res->start), + __remove_pages(PHYS_PFN(res->start), PHYS_PFN(resource_size(res)), NULL); } else { arch_remove_memory(nid, res->start, resource_size(res), From ac8f05da5174c560de122c499ce5dfb5d0dfbee5 Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Sat, 4 Jan 2020 12:59:36 -0800 Subject: [PATCH 126/146] mm/zsmalloc.c: fix the migrated zspage statistics. When zspage is migrated to the other zone, the zone page state should be updated as well, otherwise the NR_ZSPAGE for each zone shows wrong counts including proc/zoneinfo in practice. Link: http://lkml.kernel.org/r/1575434841-48009-1-git-send-email-chanho.min@lge.com Fixes: 91537fee0013 ("mm: add NR_ZSMALLOC to vmstat") Signed-off-by: Chanho Min Signed-off-by: Jinsuk Choi Reviewed-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: [4.9+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2b2b9aae8a3c..22d17ecfe7df 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2069,6 +2069,11 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, zs_pool_dec_isolated(pool); } + if (page_zone(newpage) != page_zone(page)) { + dec_zone_page_state(page, NR_ZSPAGES); + inc_zone_page_state(newpage, NR_ZSPAGES); + } + reset_page(page); put_page(page); page = newpage; From a69b83e1ae7f6c5ff2cc310870c1708405d86be2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Sat, 4 Jan 2020 12:59:39 -0800 Subject: [PATCH 127/146] kcov: fix struct layout for kcov_remote_arg Make the layout of kcov_remote_arg the same for 32-bit and 64-bit code. This makes it more convenient to write userspace apps that can be compiled into 32-bit or 64-bit binaries and still work with the same 64-bit kernel. Also use proper __u32 types in uapi headers instead of unsigned ints. Link: http://lkml.kernel.org/r/9e91020876029cfefc9211ff747685eba9536426.1575638983.git.andreyknvl@google.com Fixes: eec028c9386ed1a ("kcov: remote coverage support") Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Greg Kroah-Hartman Cc: Alan Stern Cc: Felipe Balbi Cc: Chunfeng Yun Cc: "Jacky . Cao @ sony . com" Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kcov.rst | 10 +++++----- include/uapi/linux/kcov.h | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst index 36890b026e77..1c4e1825d769 100644 --- a/Documentation/dev-tools/kcov.rst +++ b/Documentation/dev-tools/kcov.rst @@ -251,11 +251,11 @@ selectively from different subsystems. .. code-block:: c struct kcov_remote_arg { - unsigned trace_mode; - unsigned area_size; - unsigned num_handles; - uint64_t common_handle; - uint64_t handles[0]; + __u32 trace_mode; + __u32 area_size; + __u32 num_handles; + __aligned_u64 common_handle; + __aligned_u64 handles[0]; }; #define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h index 409d3ad1e6e2..1d0350e44ae3 100644 --- a/include/uapi/linux/kcov.h +++ b/include/uapi/linux/kcov.h @@ -9,11 +9,11 @@ * and the comment before kcov_remote_start() for usage details. */ struct kcov_remote_arg { - unsigned int trace_mode; /* KCOV_TRACE_PC or KCOV_TRACE_CMP */ - unsigned int area_size; /* Length of coverage buffer in words */ - unsigned int num_handles; /* Size of handles array */ - __u64 common_handle; - __u64 handles[0]; + __u32 trace_mode; /* KCOV_TRACE_PC or KCOV_TRACE_CMP */ + __u32 area_size; /* Length of coverage buffer in words */ + __u32 num_handles; /* Size of handles array */ + __aligned_u64 common_handle; + __aligned_u64 handles[0]; }; #define KCOV_REMOTE_MAX_HANDLES 0x100 From 84029fd04c201a4c7e0b07ba262664900f47c6f5 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sat, 4 Jan 2020 12:59:43 -0800 Subject: [PATCH 128/146] memcg: account security cred as well to kmemcg The cred_jar kmem_cache is already memcg accounted in the current kernel but cred->security is not. Account cred->security to kmemcg. Recently we saw high root slab usage on our production and on further inspection, we found a buggy application leaking processes. Though that buggy application was contained within its memcg but we observe much more system memory overhead, couple of GiBs, during that period. This overhead can adversely impact the isolation on the system. One source of high overhead we found was cred->security objects, which have a lifetime of at least the life of the process which allocated them. Link: http://lkml.kernel.org/r/20191205223721.40034-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Chris Down Reviewed-by: Roman Gushchin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cred.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/cred.c b/kernel/cred.c index c0a4c12d38b2..9ed51b70ed80 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -223,7 +223,7 @@ struct cred *cred_alloc_blank(void) new->magic = CRED_MAGIC; #endif - if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) + if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; return new; @@ -282,7 +282,7 @@ struct cred *prepare_creds(void) new->security = NULL; #endif - if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; validate_creds(new); return new; @@ -715,7 +715,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) #ifdef CONFIG_SECURITY new->security = NULL; #endif - if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; put_cred(old); From e0153fc2c7606f101392b682e720a7a456d6c766 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Sat, 4 Jan 2020 12:59:46 -0800 Subject: [PATCH 129/146] mm: move_pages: return valid node id in status if the page is already on the target node Felix Abecassis reports move_pages() would return random status if the pages are already on the target node by the below test program: int main(void) { const long node_id = 1; const long page_size = sysconf(_SC_PAGESIZE); const int64_t num_pages = 8; unsigned long nodemask = 1 << node_id; long ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)); if (ret < 0) return (EXIT_FAILURE); void **pages = malloc(sizeof(void*) * num_pages); for (int i = 0; i < num_pages; ++i) { pages[i] = mmap(NULL, page_size, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_POPULATE | MAP_ANONYMOUS, -1, 0); if (pages[i] == MAP_FAILED) return (EXIT_FAILURE); } ret = set_mempolicy(MPOL_DEFAULT, NULL, 0); if (ret < 0) return (EXIT_FAILURE); int *nodes = malloc(sizeof(int) * num_pages); int *status = malloc(sizeof(int) * num_pages); for (int i = 0; i < num_pages; ++i) { nodes[i] = node_id; status[i] = 0xd0; /* simulate garbage values */ } ret = move_pages(0, num_pages, pages, nodes, status, MPOL_MF_MOVE); printf("move_pages: %ld\n", ret); for (int i = 0; i < num_pages; ++i) printf("status[%d] = %d\n", i, status[i]); } Then running the program would return nonsense status values: $ ./move_pages_bug move_pages: 0 status[0] = 208 status[1] = 208 status[2] = 208 status[3] = 208 status[4] = 208 status[5] = 208 status[6] = 208 status[7] = 208 This is because the status is not set if the page is already on the target node, but move_pages() should return valid status as long as it succeeds. The valid status may be errno or node id. We can't simply initialize status array to zero since the pages may be not on node 0. Fix it by updating status with node id which the page is already on. Link: http://lkml.kernel.org/r/1575584353-125392-1-git-send-email-yang.shi@linux.alibaba.com Fixes: a49bd4d71637 ("mm, numa: rework do_pages_move") Signed-off-by: Yang Shi Reported-by: Felix Abecassis Tested-by: Felix Abecassis Suggested-by: Michal Hocko Reviewed-by: John Hubbard Acked-by: Christoph Lameter Acked-by: Michal Hocko Reviewed-by: Vlastimil Babka Cc: Mel Gorman Cc: [4.17+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index eae1565285e3..86873b6f38a7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1512,9 +1512,11 @@ static int do_move_pages_to_node(struct mm_struct *mm, /* * Resolves the given address to a struct page, isolates it from the LRU and * puts it to the given pagelist. - * Returns -errno if the page cannot be found/isolated or 0 when it has been - * queued or the page doesn't need to be migrated because it is already on - * the target node + * Returns: + * errno - if the page cannot be found/isolated + * 0 - when it doesn't have to be migrated because it is already on the + * target node + * 1 - when it has been queued */ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, int node, struct list_head *pagelist, bool migrate_all) @@ -1553,7 +1555,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { isolate_huge_page(page, pagelist); - err = 0; + err = 1; } } else { struct page *head; @@ -1563,7 +1565,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (err) goto out_putpage; - err = 0; + err = 1; list_add_tail(&head->lru, pagelist); mod_node_page_state(page_pgdat(head), NR_ISOLATED_ANON + page_is_file_cache(head), @@ -1640,8 +1642,17 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, */ err = add_page_for_migration(mm, addr, current_node, &pagelist, flags & MPOL_MF_MOVE_ALL); - if (!err) + + if (!err) { + /* The page is already on the target node */ + err = store_status(status, i, current_node, 1); + if (err) + goto out_flush; continue; + } else if (err > 0) { + /* The page is successfully queued for migration */ + continue; + } err = store_status(status, i, err, 1); if (err) From b16155a0b01ae999add72b2ad2791b9c66285880 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 4 Jan 2020 12:59:49 -0800 Subject: [PATCH 130/146] fs/direct-io.c: include fs/internal.h for missing prototype Include fs/internal.h to address the following 'sparse' warning: fs/direct-io.c:591:5: warning: symbol 'sb_init_dio_done_wq' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20191209234544.128302-1-ebiggers@kernel.org Signed-off-by: Eric Biggers Reviewed-by: Jan Kara Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/direct-io.c b/fs/direct-io.c index 0ec4f270139f..00b4d15bb811 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -39,6 +39,8 @@ #include #include +#include "internal.h" + /* * How many user pages to map in one call to get_user_pages(). This determines * the size of a structure in the slab cache From 7bebd69ecf10787e6b9559ab780496f54943cc21 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 4 Jan 2020 12:59:52 -0800 Subject: [PATCH 131/146] fs/nsfs.c: include headers for missing declarations Include linux/proc_fs.h and fs/internal.h to address the following 'sparse' warnings: fs/nsfs.c:41:32: warning: symbol 'ns_dentry_operations' was not declared. Should it be static? fs/nsfs.c:145:5: warning: symbol 'open_related_ns' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20191209234822.156179-1-ebiggers@kernel.org Signed-off-by: Eric Biggers Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nsfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nsfs.c b/fs/nsfs.c index a0431642c6b5..f75767bd623a 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -11,6 +12,8 @@ #include #include +#include "internal.h" + static struct vfsmount *nsfs_mnt; static long ns_ioctl(struct file *filp, unsigned int ioctl, From 213921f967cf44a7bceaee5535ff1d0196885076 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 4 Jan 2020 12:59:55 -0800 Subject: [PATCH 132/146] fs/namespace.c: make to_mnt_ns() static Make to_mnt_ns() static to address the following 'sparse' warning: fs/namespace.c:1731:22: warning: symbol 'to_mnt_ns' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20191209234830.156260-1-ebiggers@kernel.org Signed-off-by: Eric Biggers Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index be601d3a8008..5e1bf611a9eb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1728,7 +1728,7 @@ static bool is_mnt_ns_file(struct dentry *dentry) dentry->d_fsdata == &mntns_operations; } -struct mnt_namespace *to_mnt_ns(struct ns_common *ns) +static struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct mnt_namespace, ns); } From 780a0cfda9006a9a22d6473c2d4c527f5c68eb2e Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Sat, 4 Jan 2020 12:59:59 -0800 Subject: [PATCH 133/146] hexagon: parenthesize registers in asm predicates Hexagon requires that register predicates in assembly be parenthesized. Link: https://github.com/ClangBuiltLinux/linux/issues/754 Link: http://lkml.kernel.org/r/20191209222956.239798-3-ndesaulniers@google.com Signed-off-by: Nick Desaulniers Suggested-by: Sid Manning Acked-by: Brian Cain Cc: Lee Jones Cc: Andy Shevchenko Cc: Tuowen Zhao Cc: Mika Westerberg Cc: Luis Chamberlain Cc: Greg Kroah-Hartman Cc: Alexios Zavras Cc: Allison Randal Cc: Will Deacon Cc: Richard Fontana Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Boqun Feng Cc: Ingo Molnar Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/include/asm/atomic.h | 8 ++++---- arch/hexagon/include/asm/bitops.h | 8 ++++---- arch/hexagon/include/asm/cmpxchg.h | 2 +- arch/hexagon/include/asm/futex.h | 6 +++--- arch/hexagon/include/asm/spinlock.h | 20 ++++++++++---------- arch/hexagon/kernel/vm_entry.S | 2 +- 6 files changed, 23 insertions(+), 23 deletions(-) diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h index 12cd9231c4b8..0231d69c8bf2 100644 --- a/arch/hexagon/include/asm/atomic.h +++ b/arch/hexagon/include/asm/atomic.h @@ -91,7 +91,7 @@ static inline void atomic_##op(int i, atomic_t *v) \ "1: %0 = memw_locked(%1);\n" \ " %0 = "#op "(%0,%2);\n" \ " memw_locked(%1,P3)=%0;\n" \ - " if !P3 jump 1b;\n" \ + " if (!P3) jump 1b;\n" \ : "=&r" (output) \ : "r" (&v->counter), "r" (i) \ : "memory", "p3" \ @@ -107,7 +107,7 @@ static inline int atomic_##op##_return(int i, atomic_t *v) \ "1: %0 = memw_locked(%1);\n" \ " %0 = "#op "(%0,%2);\n" \ " memw_locked(%1,P3)=%0;\n" \ - " if !P3 jump 1b;\n" \ + " if (!P3) jump 1b;\n" \ : "=&r" (output) \ : "r" (&v->counter), "r" (i) \ : "memory", "p3" \ @@ -124,7 +124,7 @@ static inline int atomic_fetch_##op(int i, atomic_t *v) \ "1: %0 = memw_locked(%2);\n" \ " %1 = "#op "(%0,%3);\n" \ " memw_locked(%2,P3)=%1;\n" \ - " if !P3 jump 1b;\n" \ + " if (!P3) jump 1b;\n" \ : "=&r" (output), "=&r" (val) \ : "r" (&v->counter), "r" (i) \ : "memory", "p3" \ @@ -173,7 +173,7 @@ static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) " }" " memw_locked(%2, p3) = %1;" " {" - " if !p3 jump 1b;" + " if (!p3) jump 1b;" " }" "2:" : "=&r" (__oldval), "=&r" (tmp) diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h index 47384b094b94..71429f756af0 100644 --- a/arch/hexagon/include/asm/bitops.h +++ b/arch/hexagon/include/asm/bitops.h @@ -38,7 +38,7 @@ static inline int test_and_clear_bit(int nr, volatile void *addr) "1: R12 = memw_locked(R10);\n" " { P0 = tstbit(R12,R11); R12 = clrbit(R12,R11); }\n" " memw_locked(R10,P1) = R12;\n" - " {if !P1 jump 1b; %0 = mux(P0,#1,#0);}\n" + " {if (!P1) jump 1b; %0 = mux(P0,#1,#0);}\n" : "=&r" (oldval) : "r" (addr), "r" (nr) : "r10", "r11", "r12", "p0", "p1", "memory" @@ -62,7 +62,7 @@ static inline int test_and_set_bit(int nr, volatile void *addr) "1: R12 = memw_locked(R10);\n" " { P0 = tstbit(R12,R11); R12 = setbit(R12,R11); }\n" " memw_locked(R10,P1) = R12;\n" - " {if !P1 jump 1b; %0 = mux(P0,#1,#0);}\n" + " {if (!P1) jump 1b; %0 = mux(P0,#1,#0);}\n" : "=&r" (oldval) : "r" (addr), "r" (nr) : "r10", "r11", "r12", "p0", "p1", "memory" @@ -88,7 +88,7 @@ static inline int test_and_change_bit(int nr, volatile void *addr) "1: R12 = memw_locked(R10);\n" " { P0 = tstbit(R12,R11); R12 = togglebit(R12,R11); }\n" " memw_locked(R10,P1) = R12;\n" - " {if !P1 jump 1b; %0 = mux(P0,#1,#0);}\n" + " {if (!P1) jump 1b; %0 = mux(P0,#1,#0);}\n" : "=&r" (oldval) : "r" (addr), "r" (nr) : "r10", "r11", "r12", "p0", "p1", "memory" @@ -223,7 +223,7 @@ static inline int ffs(int x) int r; asm("{ P0 = cmp.eq(%1,#0); %0 = ct0(%1);}\n" - "{ if P0 %0 = #0; if !P0 %0 = add(%0,#1);}\n" + "{ if (P0) %0 = #0; if (!P0) %0 = add(%0,#1);}\n" : "=&r" (r) : "r" (x) : "p0"); diff --git a/arch/hexagon/include/asm/cmpxchg.h b/arch/hexagon/include/asm/cmpxchg.h index 6091322c3af9..92b8a02e588a 100644 --- a/arch/hexagon/include/asm/cmpxchg.h +++ b/arch/hexagon/include/asm/cmpxchg.h @@ -30,7 +30,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, __asm__ __volatile__ ( "1: %0 = memw_locked(%1);\n" /* load into retval */ " memw_locked(%1,P0) = %2;\n" /* store into memory */ - " if !P0 jump 1b;\n" + " if (!P0) jump 1b;\n" : "=&r" (retval) : "r" (ptr), "r" (x) : "memory", "p0" diff --git a/arch/hexagon/include/asm/futex.h b/arch/hexagon/include/asm/futex.h index cb635216a732..0191f7c7193e 100644 --- a/arch/hexagon/include/asm/futex.h +++ b/arch/hexagon/include/asm/futex.h @@ -16,7 +16,7 @@ /* For example: %1 = %4 */ \ insn \ "2: memw_locked(%3,p2) = %1;\n" \ - " if !p2 jump 1b;\n" \ + " if (!p2) jump 1b;\n" \ " %1 = #0;\n" \ "3:\n" \ ".section .fixup,\"ax\"\n" \ @@ -84,10 +84,10 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, "1: %1 = memw_locked(%3)\n" " {\n" " p2 = cmp.eq(%1,%4)\n" - " if !p2.new jump:NT 3f\n" + " if (!p2.new) jump:NT 3f\n" " }\n" "2: memw_locked(%3,p2) = %5\n" - " if !p2 jump 1b\n" + " if (!p2) jump 1b\n" "3:\n" ".section .fixup,\"ax\"\n" "4: %0 = #%6\n" diff --git a/arch/hexagon/include/asm/spinlock.h b/arch/hexagon/include/asm/spinlock.h index bfe07d842ff3..ef103b73bec8 100644 --- a/arch/hexagon/include/asm/spinlock.h +++ b/arch/hexagon/include/asm/spinlock.h @@ -30,9 +30,9 @@ static inline void arch_read_lock(arch_rwlock_t *lock) __asm__ __volatile__( "1: R6 = memw_locked(%0);\n" " { P3 = cmp.ge(R6,#0); R6 = add(R6,#1);}\n" - " { if !P3 jump 1b; }\n" + " { if (!P3) jump 1b; }\n" " memw_locked(%0,P3) = R6;\n" - " { if !P3 jump 1b; }\n" + " { if (!P3) jump 1b; }\n" : : "r" (&lock->lock) : "memory", "r6", "p3" @@ -46,7 +46,7 @@ static inline void arch_read_unlock(arch_rwlock_t *lock) "1: R6 = memw_locked(%0);\n" " R6 = add(R6,#-1);\n" " memw_locked(%0,P3) = R6\n" - " if !P3 jump 1b;\n" + " if (!P3) jump 1b;\n" : : "r" (&lock->lock) : "memory", "r6", "p3" @@ -61,7 +61,7 @@ static inline int arch_read_trylock(arch_rwlock_t *lock) __asm__ __volatile__( " R6 = memw_locked(%1);\n" " { %0 = #0; P3 = cmp.ge(R6,#0); R6 = add(R6,#1);}\n" - " { if !P3 jump 1f; }\n" + " { if (!P3) jump 1f; }\n" " memw_locked(%1,P3) = R6;\n" " { %0 = P3 }\n" "1:\n" @@ -78,9 +78,9 @@ static inline void arch_write_lock(arch_rwlock_t *lock) __asm__ __volatile__( "1: R6 = memw_locked(%0)\n" " { P3 = cmp.eq(R6,#0); R6 = #-1;}\n" - " { if !P3 jump 1b; }\n" + " { if (!P3) jump 1b; }\n" " memw_locked(%0,P3) = R6;\n" - " { if !P3 jump 1b; }\n" + " { if (!P3) jump 1b; }\n" : : "r" (&lock->lock) : "memory", "r6", "p3" @@ -94,7 +94,7 @@ static inline int arch_write_trylock(arch_rwlock_t *lock) __asm__ __volatile__( " R6 = memw_locked(%1)\n" " { %0 = #0; P3 = cmp.eq(R6,#0); R6 = #-1;}\n" - " { if !P3 jump 1f; }\n" + " { if (!P3) jump 1f; }\n" " memw_locked(%1,P3) = R6;\n" " %0 = P3;\n" "1:\n" @@ -117,9 +117,9 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) __asm__ __volatile__( "1: R6 = memw_locked(%0);\n" " P3 = cmp.eq(R6,#0);\n" - " { if !P3 jump 1b; R6 = #1; }\n" + " { if (!P3) jump 1b; R6 = #1; }\n" " memw_locked(%0,P3) = R6;\n" - " { if !P3 jump 1b; }\n" + " { if (!P3) jump 1b; }\n" : : "r" (&lock->lock) : "memory", "r6", "p3" @@ -139,7 +139,7 @@ static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock) __asm__ __volatile__( " R6 = memw_locked(%1);\n" " P3 = cmp.eq(R6,#0);\n" - " { if !P3 jump 1f; R6 = #1; %0 = #0; }\n" + " { if (!P3) jump 1f; R6 = #1; %0 = #0; }\n" " memw_locked(%1,P3) = R6;\n" " %0 = P3;\n" "1:\n" diff --git a/arch/hexagon/kernel/vm_entry.S b/arch/hexagon/kernel/vm_entry.S index 12242c27e2df..4023fdbea490 100644 --- a/arch/hexagon/kernel/vm_entry.S +++ b/arch/hexagon/kernel/vm_entry.S @@ -369,7 +369,7 @@ ret_from_fork: R26.L = #LO(do_work_pending); R0 = #VM_INT_DISABLE; } - if P0 jump check_work_pending + if (P0) jump check_work_pending { R0 = R25; callr R24 From 63e80314ab7cf4783526d2e44ee57a90514911c9 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Sat, 4 Jan 2020 13:00:02 -0800 Subject: [PATCH 134/146] hexagon: work around compiler crash Clang cannot translate the string "r30" into a valid register yet. Link: https://github.com/ClangBuiltLinux/linux/issues/755 Link: http://lkml.kernel.org/r/20191028155722.23419-1-ndesaulniers@google.com Signed-off-by: Nick Desaulniers Suggested-by: Sid Manning Reviewed-by: Brian Cain Cc: Allison Randal Cc: Greg Kroah-Hartman Cc: Richard Fontana Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/kernel/stacktrace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/hexagon/kernel/stacktrace.c b/arch/hexagon/kernel/stacktrace.c index 35f29423fda8..5ed02f699479 100644 --- a/arch/hexagon/kernel/stacktrace.c +++ b/arch/hexagon/kernel/stacktrace.c @@ -11,8 +11,6 @@ #include #include -register unsigned long current_frame_pointer asm("r30"); - struct stackframe { unsigned long fp; unsigned long rets; @@ -30,7 +28,7 @@ void save_stack_trace(struct stack_trace *trace) low = (unsigned long)task_stack_page(current); high = low + THREAD_SIZE; - fp = current_frame_pointer; + fp = (unsigned long)__builtin_frame_address(0); while (fp >= low && fp <= (high - sizeof(*frame))) { frame = (struct stackframe *)fp; From e39e773ad100ac94f8358d862f20101e802ae54c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 4 Jan 2020 13:00:05 -0800 Subject: [PATCH 135/146] fs/posix_acl.c: fix kernel-doc warnings Fix kernel-doc warnings in fs/posix_acl.c. Also fix one typo (setgit -> setgid). fs/posix_acl.c:647: warning: Function parameter or member 'inode' not described in 'posix_acl_update_mode' fs/posix_acl.c:647: warning: Function parameter or member 'mode_p' not described in 'posix_acl_update_mode' fs/posix_acl.c:647: warning: Function parameter or member 'acl' not described in 'posix_acl_update_mode' Link: http://lkml.kernel.org/r/29b0dc46-1f28-a4e5-b1d0-ba2b65629779@infradead.org Fixes: 073931017b49d ("posix_acl: Clear SGID bit when setting file permissions") Signed-off-by: Randy Dunlap Acked-by: Andreas Gruenbacher Reviewed-by: Jan Kara Cc: Jan Kara Cc: Andreas Gruenbacher Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/posix_acl.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 84ad1c90d535..249672bf54fe 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -631,12 +631,15 @@ EXPORT_SYMBOL_GPL(posix_acl_create); /** * posix_acl_update_mode - update mode in set_acl + * @inode: target inode + * @mode_p: mode (pointer) for update + * @acl: acl pointer * * Update the file mode when setting an ACL: compute the new file permission * bits based on the ACL. In addition, if the ACL is equivalent to the new - * file mode, set *acl to NULL to indicate that no ACL should be set. + * file mode, set *@acl to NULL to indicate that no ACL should be set. * - * As with chmod, clear the setgit bit if the caller is not in the owning group + * As with chmod, clear the setgid bit if the caller is not in the owning group * or capable of CAP_FSETID (see inode_change_ok). * * Called from set_acl inode operations. From 941f762bcb276259a78e7931674668874ccbda59 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 4 Jan 2020 13:00:09 -0800 Subject: [PATCH 136/146] mm/oom: fix pgtables units mismatch in Killed process message pr_err() expects kB, but mm_pgtables_bytes() returns the number of bytes. As everything else is printed in kB, I chose to fix the value rather than the string. Before: [ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name ... [ 1878] 1000 1878 217253 151144 1269760 0 0 python ... Out of memory: Killed process 1878 (python) total-vm:869012kB, anon-rss:604572kB, file-rss:4kB, shmem-rss:0kB, UID:1000 pgtables:1269760kB oom_score_adj:0 After: [ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name ... [ 1436] 1000 1436 217253 151890 1294336 0 0 python ... Out of memory: Killed process 1436 (python) total-vm:869012kB, anon-rss:607516kB, file-rss:44kB, shmem-rss:0kB, UID:1000 pgtables:1264kB oom_score_adj:0 Link: http://lkml.kernel.org/r/20191211202830.1600-1-idryomov@gmail.com Fixes: 70cb6d267790 ("mm/oom: add oom_score_adj and pgtables to Killed process message") Signed-off-by: Ilya Dryomov Reviewed-by: Andrew Morton Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Edward Chron Cc: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 71e3acea7817..d58c481b3df8 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -890,7 +890,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) K(get_mm_counter(mm, MM_FILEPAGES)), K(get_mm_counter(mm, MM_SHMEMPAGES)), from_kuid(&init_user_ns, task_uid(victim)), - mm_pgtables_bytes(mm), victim->signal->oom_score_adj); + mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj); task_unlock(victim); /* From a7c46c0c0e3d62f2764cd08b90934cd2aaaf8545 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Sat, 4 Jan 2020 13:00:12 -0800 Subject: [PATCH 137/146] mm/gup: fix memory leak in __gup_benchmark_ioctl In the implementation of __gup_benchmark_ioctl() the allocated pages should be released before returning in case of an invalid cmd. Release pages via kvfree(). [akpm@linux-foundation.org: rework code flow, return -EINVAL rather than -1] Link: http://lkml.kernel.org/r/20191211174653.4102-1-navid.emamdoost@gmail.com Fixes: 714a3a1ebafe ("mm/gup_benchmark.c: add additional pinning methods") Signed-off-by: Navid Emamdoost Reviewed-by: Andrew Morton Reviewed-by: Ira Weiny Reviewed-by: John Hubbard Cc: Keith Busch Cc: Kirill A. Shutemov Cc: Dave Hansen Cc: Dan Williams Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup_benchmark.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 7dd602d7f8db..ad9d5b1c4473 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -26,6 +26,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd, unsigned long i, nr_pages, addr, next; int nr; struct page **pages; + int ret = 0; if (gup->size > ULONG_MAX) return -EINVAL; @@ -63,7 +64,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd, NULL); break; default: - return -1; + kvfree(pages); + ret = -EINVAL; + goto out; } if (nr <= 0) @@ -85,7 +88,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd, gup->put_delta_usec = ktime_us_delta(end_time, start_time); kvfree(pages); - return 0; +out: + return ret; } static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, From c77c0a8ac4c522638a8242fcb9de9496e3cdbb2d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 4 Jan 2020 13:00:15 -0800 Subject: [PATCH 138/146] mm/hugetlb: defer freeing of huge pages if in non-task context The following lockdep splat was observed when a certain hugetlbfs test was run: ================================ WARNING: inconsistent lock state 4.18.0-159.el8.x86_64+debug #1 Tainted: G W --------- - - -------------------------------- inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. swapper/30/0 [HC0[0]:SC1[1]:HE1:SE0] takes: ffffffff9acdc038 (hugetlb_lock){+.?.}, at: free_huge_page+0x36f/0xaa0 {SOFTIRQ-ON-W} state was registered at: lock_acquire+0x14f/0x3b0 _raw_spin_lock+0x30/0x70 __nr_hugepages_store_common+0x11b/0xb30 hugetlb_sysctl_handler_common+0x209/0x2d0 proc_sys_call_handler+0x37f/0x450 vfs_write+0x157/0x460 ksys_write+0xb8/0x170 do_syscall_64+0xa5/0x4d0 entry_SYSCALL_64_after_hwframe+0x6a/0xdf irq event stamp: 691296 hardirqs last enabled at (691296): [] _raw_spin_unlock_irqrestore+0x4b/0x60 hardirqs last disabled at (691295): [] _raw_spin_lock_irqsave+0x22/0x81 softirqs last enabled at (691284): [] irq_enter+0xc3/0xe0 softirqs last disabled at (691285): [] irq_exit+0x23e/0x2b0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(hugetlb_lock); lock(hugetlb_lock); *** DEADLOCK *** : Call Trace: __lock_acquire+0x146b/0x48c0 lock_acquire+0x14f/0x3b0 _raw_spin_lock+0x30/0x70 free_huge_page+0x36f/0xaa0 bio_check_pages_dirty+0x2fc/0x5c0 clone_endio+0x17f/0x670 [dm_mod] blk_update_request+0x276/0xe50 scsi_end_request+0x7b/0x6a0 scsi_io_completion+0x1c6/0x1570 blk_done_softirq+0x22e/0x350 __do_softirq+0x23d/0xad8 irq_exit+0x23e/0x2b0 do_IRQ+0x11a/0x200 common_interrupt+0xf/0xf Both the hugetbl_lock and the subpool lock can be acquired in free_huge_page(). One way to solve the problem is to make both locks irq-safe. However, Mike Kravetz had learned that the hugetlb_lock is held for a linear scan of ALL hugetlb pages during a cgroup reparentling operation. So it is just too long to have irq disabled unless we can break hugetbl_lock down into finer-grained locks with shorter lock hold times. Another alternative is to defer the freeing to a workqueue job. This patch implements the deferred freeing by adding a free_hpage_workfn() work function to do the actual freeing. The free_huge_page() call in a non-task context saves the page to be freed in the hpage_freelist linked list in a lockless manner using the llist APIs. The generic workqueue is used to process the work, but a dedicated workqueue can be used instead if it is desirable to have the huge page freed ASAP. Thanks to Kirill Tkhai for suggesting the use of llist APIs which simplfy the code. Link: http://lkml.kernel.org/r/20191217170331.30893-1-longman@redhat.com Signed-off-by: Waiman Long Reviewed-by: Mike Kravetz Acked-by: Davidlohr Bueso Acked-by: Michal Hocko Reviewed-by: Kirill Tkhai Cc: Aneesh Kumar K.V Cc: Matthew Wilcox Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ac65bb5e38ac..dd8737a94bec 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -1136,7 +1137,7 @@ static inline void ClearPageHugeTemporary(struct page *page) page[2].mapping = NULL; } -void free_huge_page(struct page *page) +static void __free_huge_page(struct page *page) { /* * Can't pass hstate in here because it is called from the @@ -1199,6 +1200,54 @@ void free_huge_page(struct page *page) spin_unlock(&hugetlb_lock); } +/* + * As free_huge_page() can be called from a non-task context, we have + * to defer the actual freeing in a workqueue to prevent potential + * hugetlb_lock deadlock. + * + * free_hpage_workfn() locklessly retrieves the linked list of pages to + * be freed and frees them one-by-one. As the page->mapping pointer is + * going to be cleared in __free_huge_page() anyway, it is reused as the + * llist_node structure of a lockless linked list of huge pages to be freed. + */ +static LLIST_HEAD(hpage_freelist); + +static void free_hpage_workfn(struct work_struct *work) +{ + struct llist_node *node; + struct page *page; + + node = llist_del_all(&hpage_freelist); + + while (node) { + page = container_of((struct address_space **)node, + struct page, mapping); + node = node->next; + __free_huge_page(page); + } +} +static DECLARE_WORK(free_hpage_work, free_hpage_workfn); + +void free_huge_page(struct page *page) +{ + /* + * Defer freeing if in non-task context to avoid hugetlb_lock deadlock. + */ + if (!in_task()) { + /* + * Only call schedule_work() if hpage_freelist is previously + * empty. Otherwise, schedule_work() had been called but the + * workfn hasn't retrieved the list yet. + */ + if (llist_add((struct llist_node *)&page->mapping, + &hpage_freelist)) + schedule_work(&free_hpage_work); + return; + } + + __free_huge_page(page); +} + static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { INIT_LIST_HEAD(&page->lru); From 397eac17f86f404f5ba31d8c3e39ec3124b39fd3 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sat, 4 Jan 2020 13:00:18 -0800 Subject: [PATCH 139/146] ocfs2: call journal flush to mark journal as empty after journal recovery when mount If journal is dirty when mount, it will be replayed but jbd2 sb log tail cannot be updated to mark a new start because journal->j_flag has already been set with JBD2_ABORT first in journal_init_common. When a new transaction is committed, it will be recored in block 1 first(journal->j_tail is set to 1 in journal_reset). If emergency restart happens again before journal super block is updated unfortunately, the new recorded trans will not be replayed in the next mount. The following steps describe this procedure in detail. 1. mount and touch some files 2. these transactions are committed to journal area but not checkpointed 3. emergency restart 4. mount again and its journals are replayed 5. journal super block's first s_start is 1, but its s_seq is not updated 6. touch a new file and its trans is committed but not checkpointed 7. emergency restart again 8. mount and journal is dirty, but trans committed in 6 will not be replayed. This exception happens easily when this lun is used by only one node. If it is used by multi-nodes, other node will replay its journal and its journal super block will be updated after recovery like what this patch does. ocfs2_recover_node->ocfs2_replay_journal. The following jbd2 journal can be generated by touching a new file after journal is replayed, and seq 15 is the first valid commit, but first seq is 13 in journal super block. logdump: Block 0: Journal Superblock Seq: 0 Type: 4 (JBD2_SUPERBLOCK_V2) Blocksize: 4096 Total Blocks: 32768 First Block: 1 First Commit ID: 13 Start Log Blknum: 1 Error: 0 Feature Compat: 0 Feature Incompat: 2 block64 Feature RO compat: 0 Journal UUID: 4ED3822C54294467A4F8E87D2BA4BC36 FS Share Cnt: 1 Dynamic Superblk Blknum: 0 Per Txn Block Limit Journal: 0 Data: 0 Block 1: Journal Commit Block Seq: 14 Type: 2 (JBD2_COMMIT_BLOCK) Block 2: Journal Descriptor Seq: 15 Type: 1 (JBD2_DESCRIPTOR_BLOCK) No. Blocknum Flags 0. 587 none UUID: 00000000000000000000000000000000 1. 8257792 JBD2_FLAG_SAME_UUID 2. 619 JBD2_FLAG_SAME_UUID 3. 24772864 JBD2_FLAG_SAME_UUID 4. 8257802 JBD2_FLAG_SAME_UUID 5. 513 JBD2_FLAG_SAME_UUID JBD2_FLAG_LAST_TAG ... Block 7: Inode Inode: 8257802 Mode: 0640 Generation: 57157641 (0x3682809) FS Generation: 2839773110 (0xa9437fb6) CRC32: 00000000 ECC: 0000 Type: Regular Attr: 0x0 Flags: Valid Dynamic Features: (0x1) InlineData User: 0 (root) Group: 0 (root) Size: 7 Links: 1 Clusters: 0 ctime: 0x5de5d870 0x11104c61 -- Tue Dec 3 11:37:20.286280801 2019 atime: 0x5de5d870 0x113181a1 -- Tue Dec 3 11:37:20.288457121 2019 mtime: 0x5de5d870 0x11104c61 -- Tue Dec 3 11:37:20.286280801 2019 dtime: 0x0 -- Thu Jan 1 08:00:00 1970 ... Block 9: Journal Commit Block Seq: 15 Type: 2 (JBD2_COMMIT_BLOCK) The following is journal recovery log when recovering the upper jbd2 journal when mount again. syslog: ocfs2: File system on device (252,1) was not unmounted cleanly, recovering it. fs/jbd2/recovery.c:(do_one_pass, 449): Starting recovery pass 0 fs/jbd2/recovery.c:(do_one_pass, 449): Starting recovery pass 1 fs/jbd2/recovery.c:(do_one_pass, 449): Starting recovery pass 2 fs/jbd2/recovery.c:(jbd2_journal_recover, 278): JBD2: recovery, exit status 0, recovered transactions 13 to 13 Due to first commit seq 13 recorded in journal super is not consistent with the value recorded in block 1(seq is 14), journal recovery will be terminated before seq 15 even though it is an unbroken commit, inode 8257802 is a new file and it will be lost. Link: http://lkml.kernel.org/r/20191217020140.2197-1-li.kai4@h3c.com Signed-off-by: Kai Li Reviewed-by: Joseph Qi Reviewed-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/journal.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 1afe57f425a0..68ba354cf361 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1066,6 +1066,14 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); + if (replayed) { + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) + mlog_errno(status); + } + status = ocfs2_journal_toggle_dirty(osb, 1, replayed); if (status < 0) { mlog_errno(status); From b73eba2a867e10b9b4477738677341f3307c07bb Mon Sep 17 00:00:00 2001 From: Gang He Date: Sat, 4 Jan 2020 13:00:22 -0800 Subject: [PATCH 140/146] ocfs2: fix the crash due to call ocfs2_get_dlm_debug once less Because ocfs2_get_dlm_debug() function is called once less here, ocfs2 file system will trigger the system crash, usually after ocfs2 file system is unmounted. This system crash is caused by a generic memory corruption, these crash backtraces are not always the same, for exapmle, ocfs2: Unmounting device (253,16) on (node 172167785) general protection fault: 0000 [#1] SMP PTI CPU: 3 PID: 14107 Comm: fence_legacy Kdump: Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) RIP: 0010:__kmalloc+0xa5/0x2a0 Code: 00 00 4d 8b 07 65 4d 8b RSP: 0018:ffffaa1fc094bbe8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: d310a8800d7a3faf RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000dc0 RDI: ffff96e68fc036c0 RBP: d310a8800d7a3faf R08: ffff96e6ffdb10a0 R09: 00000000752e7079 R10: 000000000001c513 R11: 0000000004091041 R12: 0000000000000dc0 R13: 0000000000000039 R14: ffff96e68fc036c0 R15: ffff96e68fc036c0 FS: 00007f699dfba540(0000) GS:ffff96e6ffd80000(0000) knlGS:00000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055f3a9d9b768 CR3: 000000002cd1c000 CR4: 00000000000006e0 Call Trace: ext4_htree_store_dirent+0x35/0x100 [ext4] htree_dirblock_to_tree+0xea/0x290 [ext4] ext4_htree_fill_tree+0x1c1/0x2d0 [ext4] ext4_readdir+0x67c/0x9d0 [ext4] iterate_dir+0x8d/0x1a0 __x64_sys_getdents+0xab/0x130 do_syscall_64+0x60/0x1f0 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f699d33a9fb This regression problem was introduced by commit e581595ea29c ("ocfs: no need to check return value of debugfs_create functions"). Link: http://lkml.kernel.org/r/20191225061501.13587-1-ghe@suse.com Fixes: e581595ea29c ("ocfs: no need to check return value of debugfs_create functions") Signed-off-by: Gang He Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: [5.3+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1c4c51f3df60..cda1027d0819 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3282,6 +3282,7 @@ static void ocfs2_dlm_init_debug(struct ocfs2_super *osb) debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root, &dlm_debug->d_filter_secs); + ocfs2_get_dlm_debug(dlm_debug); } static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) From 7312b70699252074d753c5005fc67266c547bbe3 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Sat, 4 Jan 2020 13:00:26 -0800 Subject: [PATCH 141/146] hexagon: define ioremap_uc Similar to commit 38e45d81d14e ("sparc64: implement ioremap_uc") define ioremap_uc for hexagon to avoid errors from -Wimplicit-function-definition. Link: http://lkml.kernel.org/r/20191209222956.239798-2-ndesaulniers@google.com Link: https://github.com/ClangBuiltLinux/linux/issues/797 Fixes: e537654b7039 ("lib: devres: add a helper function for ioremap_uc") Signed-off-by: Nick Desaulniers Suggested-by: Nathan Chancellor Acked-by: Brian Cain Cc: Lee Jones Cc: Andy Shevchenko Cc: Tuowen Zhao Cc: Mika Westerberg Cc: Luis Chamberlain Cc: Greg Kroah-Hartman Cc: Alexios Zavras Cc: Allison Randal Cc: Will Deacon Cc: Richard Fontana Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Boqun Feng Cc: Ingo Molnar Cc: Geert Uytterhoeven Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/include/asm/io.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/hexagon/include/asm/io.h b/arch/hexagon/include/asm/io.h index 539e3efcf39c..b0dbc3473172 100644 --- a/arch/hexagon/include/asm/io.h +++ b/arch/hexagon/include/asm/io.h @@ -173,6 +173,7 @@ static inline void writel(u32 data, volatile void __iomem *addr) void __iomem *ioremap(unsigned long phys_addr, unsigned long size); #define ioremap_nocache ioremap +#define ioremap_uc(X, Y) ioremap((X), (Y)) #define __raw_writel writel From 8c62ed27a12c00e3db1c9f04bc0f272bdbb06734 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Thu, 2 Jan 2020 05:31:22 -0800 Subject: [PATCH 142/146] apparmor: fix aa_xattrs_match() may sleep while holding a RCU lock aa_xattrs_match() is unfortunately calling vfs_getxattr_alloc() from a context protected by an rcu_read_lock. This can not be done as vfs_getxattr_alloc() may sleep regardles of the gfp_t value being passed to it. Fix this by breaking the rcu_read_lock on the policy search when the xattr match feature is requested and restarting the search if a policy changes occur. Fixes: 8e51f9087f40 ("apparmor: Add support for attaching profiles via xattr, presence and value") Reported-by: Jia-Ju Bai Reported-by: Al Viro Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 2 +- security/apparmor/domain.c | 80 ++++++++++++++++++---------------- security/apparmor/policy.c | 4 +- 3 files changed, 45 insertions(+), 41 deletions(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 09996f2552ee..47aff8700547 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -623,7 +623,7 @@ static __poll_t ns_revision_poll(struct file *file, poll_table *pt) void __aa_bump_ns_revision(struct aa_ns *ns) { - ns->revision++; + WRITE_ONCE(ns->revision, ns->revision + 1); wake_up_interruptible(&ns->wait); } diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 9be7ccb8379e..6ceb74e0f789 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -317,6 +317,7 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, if (!bprm || !profile->xattr_count) return 0; + might_sleep(); /* transition from exec match to xattr set */ state = aa_dfa_null_transition(profile->xmatch, state); @@ -361,10 +362,11 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, } /** - * __attach_match_ - find an attachment match + * find_attach - do attachment search for unconfined processes * @bprm - binprm structure of transitioning task - * @name - to match against (NOT NULL) + * @ns: the current namespace (NOT NULL) * @head - profile list to walk (NOT NULL) + * @name - to match against (NOT NULL) * @info - info message if there was an error (NOT NULL) * * Do a linear search on the profiles in the list. There is a matching @@ -374,12 +376,11 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, * * Requires: @head not be shared or have appropriate locks held * - * Returns: profile or NULL if no match found + * Returns: label or NULL if no match found */ -static struct aa_profile *__attach_match(const struct linux_binprm *bprm, - const char *name, - struct list_head *head, - const char **info) +static struct aa_label *find_attach(const struct linux_binprm *bprm, + struct aa_ns *ns, struct list_head *head, + const char *name, const char **info) { int candidate_len = 0, candidate_xattrs = 0; bool conflict = false; @@ -388,6 +389,8 @@ static struct aa_profile *__attach_match(const struct linux_binprm *bprm, AA_BUG(!name); AA_BUG(!head); + rcu_read_lock(); +restart: list_for_each_entry_rcu(profile, head, base.list) { if (profile->label.flags & FLAG_NULL && &profile->label == ns_unconfined(profile->ns)) @@ -413,16 +416,32 @@ static struct aa_profile *__attach_match(const struct linux_binprm *bprm, perm = dfa_user_allow(profile->xmatch, state); /* any accepting state means a valid match. */ if (perm & MAY_EXEC) { - int ret; + int ret = 0; if (count < candidate_len) continue; - ret = aa_xattrs_match(bprm, profile, state); - /* Fail matching if the xattrs don't match */ - if (ret < 0) - continue; + if (bprm && profile->xattr_count) { + long rev = READ_ONCE(ns->revision); + if (!aa_get_profile_not0(profile)) + goto restart; + rcu_read_unlock(); + ret = aa_xattrs_match(bprm, profile, + state); + rcu_read_lock(); + aa_put_profile(profile); + if (rev != + READ_ONCE(ns->revision)) + /* policy changed */ + goto restart; + /* + * Fail matching if the xattrs don't + * match + */ + if (ret < 0) + continue; + } /* * TODO: allow for more flexible best match * @@ -445,43 +464,28 @@ static struct aa_profile *__attach_match(const struct linux_binprm *bprm, candidate_xattrs = ret; conflict = false; } - } else if (!strcmp(profile->base.name, name)) + } else if (!strcmp(profile->base.name, name)) { /* * old exact non-re match, without conditionals such * as xattrs. no more searching required */ - return profile; + candidate = profile; + goto out; + } } - if (conflict) { - *info = "conflicting profile attachments"; + if (!candidate || conflict) { + if (conflict) + *info = "conflicting profile attachments"; + rcu_read_unlock(); return NULL; } - return candidate; -} - -/** - * find_attach - do attachment search for unconfined processes - * @bprm - binprm structure of transitioning task - * @ns: the current namespace (NOT NULL) - * @list: list to search (NOT NULL) - * @name: the executable name to match against (NOT NULL) - * @info: info message if there was an error - * - * Returns: label or NULL if no match found - */ -static struct aa_label *find_attach(const struct linux_binprm *bprm, - struct aa_ns *ns, struct list_head *list, - const char *name, const char **info) -{ - struct aa_profile *profile; - - rcu_read_lock(); - profile = aa_get_profile(__attach_match(bprm, name, list, info)); +out: + candidate = aa_get_newest_profile(candidate); rcu_read_unlock(); - return profile ? &profile->label : NULL; + return &candidate->label; } static const char *next_name(int xtype, const char *name) diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 03104830c913..269f2f53c0b1 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -1125,8 +1125,8 @@ ssize_t aa_remove_profiles(struct aa_ns *policy_ns, struct aa_label *subj, if (!name) { /* remove namespace - can only happen if fqname[0] == ':' */ mutex_lock_nested(&ns->parent->lock, ns->level); - __aa_remove_ns(ns); __aa_bump_ns_revision(ns); + __aa_remove_ns(ns); mutex_unlock(&ns->parent->lock); } else { /* remove profile */ @@ -1138,9 +1138,9 @@ ssize_t aa_remove_profiles(struct aa_ns *policy_ns, struct aa_label *subj, goto fail_ns_lock; } name = profile->base.hname; + __aa_bump_ns_revision(ns); __remove_profile(profile); __aa_labelset_update_subtree(ns); - __aa_bump_ns_revision(ns); mutex_unlock(&ns->lock); } From 9d05c18e8d7de566ff68f221fcae65e78708dd1d Mon Sep 17 00:00:00 2001 From: Zong Li Date: Mon, 23 Dec 2019 16:46:14 +0800 Subject: [PATCH 143/146] clocksource: riscv: add notrace to riscv_sched_clock When enabling ftrace graph tracer, it gets the tracing clock in ftrace_push_return_trace(). Eventually, it invokes riscv_sched_clock() to get the clock value. If riscv_sched_clock() isn't marked with 'notrace', it will call ftrace_push_return_trace() and cause infinite loop. The result of failure as follow: command: echo function_graph >current_tracer [ 46.176787] Unable to handle kernel paging request at virtual address ffffffe04fb38c48 [ 46.177309] Oops [#1] [ 46.177478] Modules linked in: [ 46.177770] CPU: 0 PID: 256 Comm: $d Not tainted 5.5.0-rc1 #47 [ 46.177981] epc: ffffffe00035e59a ra : ffffffe00035e57e sp : ffffffe03a7569b0 [ 46.178216] gp : ffffffe000d29b90 tp : ffffffe03a756180 t0 : ffffffe03a756968 [ 46.178430] t1 : ffffffe00087f408 t2 : ffffffe03a7569a0 s0 : ffffffe03a7569f0 [ 46.178643] s1 : ffffffe00087f408 a0 : 0000000ac054cda4 a1 : 000000000087f411 [ 46.178856] a2 : 0000000ac054cda4 a3 : 0000000000373ca0 a4 : ffffffe04fb38c48 [ 46.179099] a5 : 00000000153e22a8 a6 : 00000000005522ff a7 : 0000000000000005 [ 46.179338] s2 : ffffffe03a756a90 s3 : ffffffe00032811c s4 : ffffffe03a756a58 [ 46.179570] s5 : ffffffe000d29fe0 s6 : 0000000000000001 s7 : 0000000000000003 [ 46.179809] s8 : 0000000000000003 s9 : 0000000000000002 s10: 0000000000000004 [ 46.180053] s11: 0000000000000000 t3 : 0000003fc815749c t4 : 00000000000efc90 [ 46.180293] t5 : ffffffe000d29658 t6 : 0000000000040000 [ 46.180482] status: 0000000000000100 badaddr: ffffffe04fb38c48 cause: 000000000000000f Signed-off-by: Zong Li Reviewed-by: Steven Rostedt (VMware) [paul.walmsley@sifive.com: cleaned up patch description] Fixes: 92e0d143fdef ("clocksource/drivers/riscv_timer: Provide the sched_clock") Cc: stable@vger.kernel.org Signed-off-by: Paul Walmsley --- drivers/clocksource/timer-riscv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-riscv.c b/drivers/clocksource/timer-riscv.c index 4e54856ce2a5..c4f15c4068c0 100644 --- a/drivers/clocksource/timer-riscv.c +++ b/drivers/clocksource/timer-riscv.c @@ -56,7 +56,7 @@ static unsigned long long riscv_clocksource_rdtime(struct clocksource *cs) return get_cycles64(); } -static u64 riscv_sched_clock(void) +static u64 notrace riscv_sched_clock(void) { return get_cycles64(); } From 2f3035da4019780250658d1ffe486bc324e04805 Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Fri, 20 Dec 2019 03:09:49 -0800 Subject: [PATCH 144/146] riscv: prefix IRQ_ macro names with an RV_ namespace "IRQ_TIMER", used in the arch/riscv CSR header file, is a sufficiently generic macro name that it's used by several source files across the Linux code base. Some of these other files ultimately include the arch/riscv CSR include file, causing collisions. Fix by prefixing the RISC-V csr.h IRQ_ macro names with an RV_ prefix. Fixes: a4c3733d32a72 ("riscv: abstract out CSR names for supervisor vs machine mode") Reported-by: Olof Johansson Acked-by: Olof Johansson Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/csr.h | 18 +++++++++--------- arch/riscv/kernel/irq.c | 6 +++--- drivers/irqchip/irq-sifive-plic.c | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 0a62d2d68455..435b65532e29 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -116,9 +116,9 @@ # define SR_PIE SR_MPIE # define SR_PP SR_MPP -# define IRQ_SOFT IRQ_M_SOFT -# define IRQ_TIMER IRQ_M_TIMER -# define IRQ_EXT IRQ_M_EXT +# define RV_IRQ_SOFT IRQ_M_SOFT +# define RV_IRQ_TIMER IRQ_M_TIMER +# define RV_IRQ_EXT IRQ_M_EXT #else /* CONFIG_RISCV_M_MODE */ # define CSR_STATUS CSR_SSTATUS # define CSR_IE CSR_SIE @@ -133,15 +133,15 @@ # define SR_PIE SR_SPIE # define SR_PP SR_SPP -# define IRQ_SOFT IRQ_S_SOFT -# define IRQ_TIMER IRQ_S_TIMER -# define IRQ_EXT IRQ_S_EXT +# define RV_IRQ_SOFT IRQ_S_SOFT +# define RV_IRQ_TIMER IRQ_S_TIMER +# define RV_IRQ_EXT IRQ_S_EXT #endif /* CONFIG_RISCV_M_MODE */ /* IE/IP (Supervisor/Machine Interrupt Enable/Pending) flags */ -#define IE_SIE (_AC(0x1, UL) << IRQ_SOFT) -#define IE_TIE (_AC(0x1, UL) << IRQ_TIMER) -#define IE_EIE (_AC(0x1, UL) << IRQ_EXT) +#define IE_SIE (_AC(0x1, UL) << RV_IRQ_SOFT) +#define IE_TIE (_AC(0x1, UL) << RV_IRQ_TIMER) +#define IE_EIE (_AC(0x1, UL) << RV_IRQ_EXT) #ifndef __ASSEMBLY__ diff --git a/arch/riscv/kernel/irq.c b/arch/riscv/kernel/irq.c index 3f07a91d5afb..345c4f2eba13 100644 --- a/arch/riscv/kernel/irq.c +++ b/arch/riscv/kernel/irq.c @@ -23,11 +23,11 @@ asmlinkage __visible void __irq_entry do_IRQ(struct pt_regs *regs) irq_enter(); switch (regs->cause & ~CAUSE_IRQ_FLAG) { - case IRQ_TIMER: + case RV_IRQ_TIMER: riscv_timer_interrupt(); break; #ifdef CONFIG_SMP - case IRQ_SOFT: + case RV_IRQ_SOFT: /* * We only use software interrupts to pass IPIs, so if a non-SMP * system gets one, then we don't know what to do. @@ -35,7 +35,7 @@ asmlinkage __visible void __irq_entry do_IRQ(struct pt_regs *regs) riscv_software_interrupt(); break; #endif - case IRQ_EXT: + case RV_IRQ_EXT: handle_arch_irq(regs); break; default: diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index 8df547d2d935..0aca5807a119 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -256,7 +256,7 @@ static int __init plic_init(struct device_node *node, * Skip contexts other than external interrupts for our * privilege level. */ - if (parent.args[0] != IRQ_EXT) + if (parent.args[0] != RV_IRQ_EXT) continue; hartid = plic_find_hart_id(parent.np); From 0e194d9da198936fe4fb4c1e031de0f7791c09b8 Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Fri, 22 Nov 2019 18:33:28 -0800 Subject: [PATCH 145/146] Documentation: riscv: add patch acceptance guidelines Formalize, in kernel documentation, the patch acceptance policy for arch/riscv. In summary, it states that as maintainers, we plan to only accept patches for new modules or extensions that have been frozen or ratified by the RISC-V Foundation. We've been following these guidelines for the past few months. In the meantime, we've received quite a bit of feedback that it would be helpful to have these guidelines formally documented. Based on a suggestion from Matthew Wilcox, we also add a link to this file to Documentation/process/index.rst, to make this document easier to find. The format of this document has also been changed to align to the format outlined in the maintainer entry profiles, in accordance with comments from Jon Corbet and Dan Williams. Signed-off-by: Paul Walmsley Reviewed-by: Palmer Dabbelt Cc: Palmer Dabbelt Cc: Albert Ou Cc: Krste Asanovic Cc: Andrew Waterman Cc: Matthew Wilcox Cc: Dan Williams Cc: Jonathan Corbet --- Documentation/process/index.rst | 1 + Documentation/riscv/index.rst | 1 + Documentation/riscv/patch-acceptance.rst | 35 ++++++++++++++++++++++++ MAINTAINERS | 1 + 4 files changed, 38 insertions(+) create mode 100644 Documentation/riscv/patch-acceptance.rst diff --git a/Documentation/process/index.rst b/Documentation/process/index.rst index 21aa7d5358e6..6399d92f0b21 100644 --- a/Documentation/process/index.rst +++ b/Documentation/process/index.rst @@ -60,6 +60,7 @@ lack of a better place. volatile-considered-harmful botching-up-ioctls clang-format + ../riscv/patch-acceptance .. only:: subproject and html diff --git a/Documentation/riscv/index.rst b/Documentation/riscv/index.rst index 215fd3c1f2d5..fa33bffd8992 100644 --- a/Documentation/riscv/index.rst +++ b/Documentation/riscv/index.rst @@ -7,6 +7,7 @@ RISC-V architecture boot-image-header pmu + patch-acceptance .. only:: subproject and html diff --git a/Documentation/riscv/patch-acceptance.rst b/Documentation/riscv/patch-acceptance.rst new file mode 100644 index 000000000000..dfe0ac5624fb --- /dev/null +++ b/Documentation/riscv/patch-acceptance.rst @@ -0,0 +1,35 @@ +.. SPDX-License-Identifier: GPL-2.0 + +arch/riscv maintenance guidelines for developers +================================================ + +Overview +-------- +The RISC-V instruction set architecture is developed in the open: +in-progress drafts are available for all to review and to experiment +with implementations. New module or extension drafts can change +during the development process - sometimes in ways that are +incompatible with previous drafts. This flexibility can present a +challenge for RISC-V Linux maintenance. Linux maintainers disapprove +of churn, and the Linux development process prefers well-reviewed and +tested code over experimental code. We wish to extend these same +principles to the RISC-V-related code that will be accepted for +inclusion in the kernel. + +Submit Checklist Addendum +------------------------- +We'll only accept patches for new modules or extensions if the +specifications for those modules or extensions are listed as being +"Frozen" or "Ratified" by the RISC-V Foundation. (Developers may, of +course, maintain their own Linux kernel trees that contain code for +any draft extensions that they wish.) + +Additionally, the RISC-V specification allows implementors to create +their own custom extensions. These custom extensions aren't required +to go through any review or ratification process by the RISC-V +Foundation. To avoid the maintenance complexity and potential +performance impact of adding kernel code for implementor-specific +RISC-V extensions, we'll only to accept patches for extensions that +have been officially frozen or ratified by the RISC-V Foundation. +(Implementors, may, of course, maintain their own Linux kernel trees +containing code for any custom extensions that they wish.) diff --git a/MAINTAINERS b/MAINTAINERS index e09bd92a1e44..2987d1e16d20 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14119,6 +14119,7 @@ M: Paul Walmsley M: Palmer Dabbelt M: Albert Ou L: linux-riscv@lists.infradead.org +P: Documentation/riscv/patch-acceptance.rst T: git git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git S: Supported F: arch/riscv/ From c79f46a282390e0f5b306007bf7b11a46d529538 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 5 Jan 2020 14:23:27 -0800 Subject: [PATCH 146/146] Linux 5.5-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b99d95df8075..e4c2d0327d8c 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 5 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Kleptomaniac Octopus # *DOCUMENTATION*