From c0d2f4ce5c9fbc3e3b219d828b77ecd4445c1ad2 Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Fri, 25 Aug 2023 04:30:57 +0300 Subject: [PATCH 01/64] docs: fix link s390/zfcpdump.rst After move of Documentation/s390 to Documentation/arch/s390 Link: https://lkml.kernel.org/r/20230825013102.1487979-1-costa.shul@redhat.com Signed-off-by: Costa Shulyupin Cc: Baoquan He Cc: Eric DeVolder Cc: Hari Bathini Cc: Sourabh Jain Cc: Heiko Carstens Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 9bfe68fe9676..7aff28ded2f4 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -110,7 +110,7 @@ config CRASH_DUMP For more details see Documentation/admin-guide/kdump/kdump.rst For s390, this option also enables zfcpdump. - See also + See also config CRASH_HOTPLUG bool "Update the crash elfcorehdr on system configuration changes" From a8306f2d4dcea03538c70c26d2948483f70254ff Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 31 Aug 2023 09:33:40 -0700 Subject: [PATCH 02/64] compiler.h: unify __UNIQUE_ID commit 6f33d58794ef ("__UNIQUE_ID()") added a fallback definition of __UNIQUE_ID because gcc 4.2 and older did not support __COUNTER__. Also, this commit is effectively a revert of commit b41c29b0527c ("Kbuild: provide a __UNIQUE_ID for clang") which mentions clang 2.6+ supporting __COUNTER__. Documentation/process/changes.rst currently lists the minimum supported version of these compilers as: - gcc: 5.1 - clang: 11.0.0 It should be safe to say that __COUNTER__ is well supported by this point. Link: https://lkml.kernel.org/r/20230831-unique_id-v1-1-28bacd18eb1d@google.com Signed-off-by: Nick Desaulniers Cc: Arnd Bergmann Cc: Jan Beulich Cc: Luc Van Oostenryck Cc: Michal rarek Cc: Nathan Chancellor Cc: Paul Russel Cc: Tom Rix Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 5 ----- include/linux/compiler-gcc.h | 2 -- include/linux/compiler.h | 5 +---- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 9b673fefcef8..ddab1ef22bee 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -14,11 +14,6 @@ #undef __cleanup #define __cleanup(func) __maybe_unused __attribute__((__cleanup__(func))) -/* same as gcc, this was present in clang-2.6 so we can assume it works - * with any version that can compile the kernel - */ -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - /* all clang versions usable with the kernel support KASAN ABI version 5 */ #define KASAN_ABI_VERSION 5 diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 7af9e34ec261..2ceba3fe4ec1 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -39,8 +39,6 @@ #define __noretpoline __attribute__((__indirect_branch__("keep"))) #endif -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__) #define __latent_entropy __attribute__((latent_entropy)) #endif diff --git a/include/linux/compiler.h b/include/linux/compiler.h index d7779a18b24f..174099fdc485 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -177,10 +177,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, __asm__ ("" : "=r" (var) : "0" (var)) #endif -/* Not-quite-unique ID. 
*/ -#ifndef __UNIQUE_ID -# define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __LINE__) -#endif +#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) /** * data_race - mark an expression as containing intentional data races From 00adf323b2e7c973a7129b906c381ed377aec1a5 Mon Sep 17 00:00:00 2001 From: Yuanheng Zhang Date: Mon, 28 Aug 2023 13:17:41 +0800 Subject: [PATCH 03/64] ocfs2: correct range->len in ocfs2_trim_fs() global bitmap is a cluster allocator,so after we traverse the global bitmap and finished the fstrim,the trimmed range should be 'trimmed * clustersize'.otherwise,the trimmed range printed by 'fstrim -v' is not as expected. Link: https://lkml.kernel.org/r/20230828051741.204577-1-yuanhengzhang1214@gmail.com Signed-off-by: Yuanheng Zhang Reviewed-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index aef58f1395c8..0fb5e3a875d2 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7642,7 +7642,7 @@ out_mutex: goto next_group; } out: - range->len = trimmed * sb->s_blocksize; + range->len = trimmed * osb->s_clustersize; return ret; } From 33a9813825710fdc2b980d566ee391fd093a36c6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 24 Aug 2023 16:31:42 +0200 Subject: [PATCH 04/64] introduce __next_thread(), fix next_tid() vs exec() race Patch series "introduce __next_thread(), change next_thread()". After commit dce8f8ed1de1 ("document while_each_thread(), change first_tid() to use for_each_thread()") + this series 1. We have only one lockless user of next_thread(), task_group_seq_get_next(). I think it should be changed too. 2. We have only one user of task_struct->thread_group, thread_group_empty(). The next patches will change thread_group_empty() and kill ->thread_group. This patch (of 2): next_tid(start) does: rcu_read_lock(); if (pid_alive(start)) { pos = next_thread(start); if (thread_group_leader(pos)) pos = NULL; else get_task_struct(pos); it should return pos = NULL when next_thread() wraps to the 1st thread in the thread group, group leader, and the thread_group_leader() check tries to detect this case. But this can race with exec. To simplify, suppose we have a main thread M and a single sub-thread T, next_tid(T) should return NULL. Now suppose that T execs. If next_tid(T) is called after T changes the leadership and before it does release_task() which removes the old leader from list, then next_thread() returns M and thread_group_leader(M) = F. Lockless use of next_thread() should be avoided. After this change only task_group_seq_get_next() does this, and I believe it should be changed as well. Link: https://lkml.kernel.org/r/20230824143112.GA31208@redhat.com Link: https://lkml.kernel.org/r/20230824143142.GA31222@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric W. 
Biederman Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- fs/proc/base.c | 6 ++---- include/linux/sched/signal.h | 11 +++++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index ffd54617c354..c0e971cc6d41 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3840,10 +3840,8 @@ static struct task_struct *next_tid(struct task_struct *start) struct task_struct *pos = NULL; rcu_read_lock(); if (pid_alive(start)) { - pos = next_thread(start); - if (thread_group_leader(pos)) - pos = NULL; - else + pos = __next_thread(start); + if (pos) get_task_struct(pos); } rcu_read_unlock(); diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 0014d3adaf84..7fb34b8cda54 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -715,6 +715,17 @@ bool same_thread_group(struct task_struct *p1, struct task_struct *p2) return p1->signal == p2->signal; } +/* + * returns NULL if p is the last thread in the thread group + */ +static inline struct task_struct *__next_thread(struct task_struct *p) +{ + return list_next_or_null_rcu(&p->signal->thread_head, + &p->thread_node, + struct task_struct, + thread_node); +} + static inline struct task_struct *next_thread(const struct task_struct *p) { return list_entry_rcu(p->thread_group.next, From d639cf4abb4d171ab2456904da5668c42b5c1937 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 24 Aug 2023 16:32:01 +0200 Subject: [PATCH 05/64] change next_thread() to use __next_thread() ?: group_leader This relies on fact that group leader is always the 1st entry in the signal->thread_head list. With or without this change, if the lockless next_thread(last_thread) races with exec it can return the old or the new leader. We are almost ready to kill task->thread_group, after this change its only user is thread_group_empty(). Link: https://lkml.kernel.org/r/20230824143201.GB31222@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric W. Biederman Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/sched/signal.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 7fb34b8cda54..cffc882d367f 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -726,10 +726,9 @@ static inline struct task_struct *__next_thread(struct task_struct *p) thread_node); } -static inline struct task_struct *next_thread(const struct task_struct *p) +static inline struct task_struct *next_thread(struct task_struct *p) { - return list_entry_rcu(p->thread_group.next, - struct task_struct, thread_group); + return __next_thread(p) ?: p->group_leader; } static inline int thread_group_empty(struct task_struct *p) From e34a35ee1f52312af130b5ebd42fa28313fc6149 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 26 Aug 2023 13:14:06 +0200 Subject: [PATCH 06/64] change thread_group_empty() to use task_struct->thread_node Patch series "kill task_struct->thread_group". This patch (of 2): It could use list_is_singular() but this way it is cheaper. Plus the thread_group_leader() check makes it clear that thread_group_empty() can only return true if p is a group leader. This was not immediately obvious before this patch. task_struct->thread_group no longer has users, it can die. Link: https://lkml.kernel.org/r/20230826111200.GA22982@redhat.com Link: https://lkml.kernel.org/r/20230826111406.GA23238@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric W. 
Biederman Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/sched/signal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index cffc882d367f..d7fa3ca2fa53 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -733,7 +733,8 @@ static inline struct task_struct *next_thread(struct task_struct *p) static inline int thread_group_empty(struct task_struct *p) { - return list_empty(&p->thread_group); + return thread_group_leader(p) && + list_is_last(&p->thread_node, &p->signal->thread_head); } #define delay_group_leader(p) \ From 8e1f385104ac044f1552686ad6e1cbc71cc05a30 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 26 Aug 2023 13:14:09 +0200 Subject: [PATCH 07/64] kill task_struct->thread_group The last user was removed by the previous patch. Link: https://lkml.kernel.org/r/20230826111409.GA23243@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric W. Biederman Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/sched.h | 1 - init/init_task.c | 1 - kernel/exit.c | 1 - kernel/fork.c | 3 --- 4 files changed, 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 77f01ac385f7..6d1341b1673f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1002,7 +1002,6 @@ struct task_struct { /* PID/PID hash table linkage. */ struct pid *thread_pid; struct hlist_node pid_links[PIDTYPE_MAX]; - struct list_head thread_group; struct list_head thread_node; struct completion *vfork_done; diff --git a/init/init_task.c b/init/init_task.c index ff6c4b9bfe6b..c0de0200fd56 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -132,7 +132,6 @@ struct task_struct init_task .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), .timer_slack_ns = 50000, /* 50 usec default slack */ .thread_pid = &init_struct_pid, - .thread_group = LIST_HEAD_INIT(init_task.thread_group), .thread_node = LIST_HEAD_INIT(init_signals.thread_head), #ifdef CONFIG_AUDIT .loginuid = INVALID_UID, diff --git a/kernel/exit.c b/kernel/exit.c index edb50b4c9972..f3ba4b97a7d9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -133,7 +133,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_init(&p->sibling); __this_cpu_dec(process_counts); } - list_del_rcu(&p->thread_group); list_del_rcu(&p->thread_node); } diff --git a/kernel/fork.c b/kernel/fork.c index 3b6d20dfb9a8..b9d3aa493bbd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2576,7 +2576,6 @@ __latent_entropy struct task_struct *copy_process( p->dirty_paused_when = 0; p->pdeath_signal = 0; - INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; clear_posix_cputimers_work(p); @@ -2704,8 +2703,6 @@ __latent_entropy struct task_struct *copy_process( atomic_inc(¤t->signal->live); refcount_inc(¤t->signal->sigcnt); task_join_group_stop(p); - list_add_tail_rcu(&p->thread_group, - &p->group_leader->thread_group); list_add_tail_rcu(&p->thread_node, &p->signal->thread_head); } From 398352049146e34ec6113a00c63457149a81345c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 23 Aug 2023 19:14:55 +0200 Subject: [PATCH 08/64] __kill_pgrp_info: simplify the calculation of return value No need to calculate/check the "success" variable, we can kill it and update retval in the main loop unless it is zero. Link: https://lkml.kernel.org/r/20230823171455.GA12188@redhat.com Signed-off-by: Oleg Nesterov Suggested-by: David Laight Cc: Eric W. 
Biederman Signed-off-by: Andrew Morton --- kernel/signal.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 09019017d669..fc276fc07d87 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1471,16 +1471,21 @@ int group_send_sig_info(int sig, struct kernel_siginfo *info, int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { struct task_struct *p = NULL; - int retval, success; + int ret = -ESRCH; - success = 0; - retval = -ESRCH; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID); - success |= !err; - retval = err; + /* + * If group_send_sig_info() succeeds at least once ret + * becomes 0 and after that the code below has no effect. + * Otherwise we return the last err or -ESRCH if this + * process group is empty. + */ + if (ret) + ret = err; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - return success ? 0 : retval; + + return ret; } int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) From 9734fe4dc22052d8010cc0b68b092fe5335ccb31 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 4 Sep 2023 17:21:01 +0200 Subject: [PATCH 09/64] panic: use atomic_try_cmpxchg in panic() and nmi_panic() Use atomic_try_cmpxchg instead of atomic_cmpxchg (*ptr, old, new) == old in panic() and nmi_panic(). x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, rename cpu variable to this_cpu in nmi_panic() and try to unify logic flow between panic() and nmi_panic(). No functional change intended. [ubizjak@gmail.com: clean up if/else block] Link: https://lkml.kernel.org/r/20230906191200.68707-1-ubizjak@gmail.com Link: https://lkml.kernel.org/r/20230904152230.9227-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Mark Rutland Signed-off-by: Andrew Morton --- kernel/panic.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index ffa037fa777d..2807639aab51 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -192,14 +192,15 @@ atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); */ void nmi_panic(struct pt_regs *regs, const char *msg) { - int old_cpu, cpu; + int old_cpu, this_cpu; - cpu = raw_smp_processor_id(); - old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu); + old_cpu = PANIC_CPU_INVALID; + this_cpu = raw_smp_processor_id(); - if (old_cpu == PANIC_CPU_INVALID) + /* atomic_try_cmpxchg updates old_cpu on failure */ + if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) panic("%s", msg); - else if (old_cpu != cpu) + else if (old_cpu != this_cpu) nmi_panic_self_stop(regs); } EXPORT_SYMBOL(nmi_panic); @@ -311,15 +312,18 @@ void panic(const char *fmt, ...) * stop themself or will wait until they are stopped by the 1st CPU * with smp_send_stop(). * - * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which - * comes here, so go ahead. + * cmpxchg success means this is the 1st CPU which comes here, + * so go ahead. * `old_cpu == this_cpu' means we came from nmi_panic() which sets * panic_cpu to this CPU. In this case, this is also the 1st CPU. 
*/ + old_cpu = PANIC_CPU_INVALID; this_cpu = raw_smp_processor_id(); - old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); - if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) + /* atomic_try_cmpxchg updates old_cpu on failure */ + if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + /* go ahead */ + } else if (old_cpu != this_cpu) panic_smp_self_stop(); console_verbose(); From 9cba82bba500e3ce875381350f289cfb3aa633ba Mon Sep 17 00:00:00 2001 From: Xingui Yang Date: Tue, 5 Sep 2023 02:48:33 +0000 Subject: [PATCH 10/64] seq_file: add helper macro to define attribute for rw file Patch series "Add helper macro DEFINE_SHOW_STORE_ATTRIBUTE() at seq_file.c", v6. We already own DEFINE_SHOW_ATTRIBUTE() helper macro for defining attribute for read-only file, but we found many of drivers also want a helper macro for read-write file too. So we add this helper macro to reduce duplicated code. This patch (of 3): We already own DEFINE_SHOW_ATTRIBUTE() helper macro for defining attribute for read-only file, but many of drivers want a helper macro for read-write file too. So we add DEFINE_SHOW_STORE_ATTRIBUTE() helper to reduce duplicated code. Link: https://lkml.kernel.org/r/20230905024835.43219-1-yangxingui@huawei.com Link: https://lkml.kernel.org/r/20230905024835.43219-2-yangxingui@huawei.com Signed-off-by: Luo Jiaxing Co-developed-by: Xingui Yang Signed-off-by: Xingui Yang Reviewed-by: Andy Shevchenko Cc: Al Viro Cc: Animesh Manna Cc: Anshuman Gupta Cc: Damien Le Moal Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Himanshu Madhani Cc: James Bottomley Cc: John Garry Cc: Martin K. Petersen Cc: Uma Shankar Cc: Xiang Chen Cc: Zeng Tao Signed-off-by: Andrew Morton --- include/linux/seq_file.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 386ab580b839..234bcdb1fba4 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -207,6 +207,21 @@ static const struct file_operations __name ## _fops = { \ .release = single_release, \ } +#define DEFINE_SHOW_STORE_ATTRIBUTE(__name) \ +static int __name ## _open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, __name ## _show, inode->i_private); \ +} \ + \ +static const struct file_operations __name ## _fops = { \ + .owner = THIS_MODULE, \ + .open = __name ## _open, \ + .read = seq_read, \ + .write = __name ## _write, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} + #define DEFINE_PROC_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ From 00c9d55f57d53a12866cbf5e5aee9a9a0e572b19 Mon Sep 17 00:00:00 2001 From: Xingui Yang Date: Tue, 5 Sep 2023 02:48:34 +0000 Subject: [PATCH 11/64] scsi: hisi_sas: use DEFINE_SHOW_STORE_ATTRIBUTE() helper for debugfs Use DEFINE_SHOW_STORE_ATTRIBUTE() helper for read-write file to reduce some duplicated code. Link: https://lkml.kernel.org/r/20230905024835.43219-3-yangxingui@huawei.com Signed-off-by: Luo Jiaxing Co-developed-by: Xingui Yang Signed-off-by: Xingui Yang Reviewed-by: Andy Shevchenko Cc: Al Viro Cc: Animesh Manna Cc: Anshuman Gupta Cc: Damien Le Moal Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Himanshu Madhani Cc: James Bottomley Cc: John Garry Cc: Martin K. 
Petersen Cc: Uma Shankar Cc: Xiang Chen Cc: Zeng Tao Signed-off-by: Andrew Morton --- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 137 ++----------------------- 1 file changed, 9 insertions(+), 128 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index bbb64ee6afd7..5bb35c3ea4e5 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -3990,22 +3990,7 @@ static ssize_t debugfs_bist_linkrate_v3_hw_write(struct file *filp, return count; } - -static int debugfs_bist_linkrate_v3_hw_open(struct inode *inode, - struct file *filp) -{ - return single_open(filp, debugfs_bist_linkrate_v3_hw_show, - inode->i_private); -} - -static const struct file_operations debugfs_bist_linkrate_v3_hw_fops = { - .open = debugfs_bist_linkrate_v3_hw_open, - .read = seq_read, - .write = debugfs_bist_linkrate_v3_hw_write, - .llseek = seq_lseek, - .release = single_release, - .owner = THIS_MODULE, -}; +DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_linkrate_v3_hw); static const struct { int value; @@ -4080,22 +4065,7 @@ static ssize_t debugfs_bist_code_mode_v3_hw_write(struct file *filp, return count; } - -static int debugfs_bist_code_mode_v3_hw_open(struct inode *inode, - struct file *filp) -{ - return single_open(filp, debugfs_bist_code_mode_v3_hw_show, - inode->i_private); -} - -static const struct file_operations debugfs_bist_code_mode_v3_hw_fops = { - .open = debugfs_bist_code_mode_v3_hw_open, - .read = seq_read, - .write = debugfs_bist_code_mode_v3_hw_write, - .llseek = seq_lseek, - .release = single_release, - .owner = THIS_MODULE, -}; +DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_code_mode_v3_hw); static ssize_t debugfs_bist_phy_v3_hw_write(struct file *filp, const char __user *buf, @@ -4129,22 +4099,7 @@ static int debugfs_bist_phy_v3_hw_show(struct seq_file *s, void *p) return 0; } - -static int debugfs_bist_phy_v3_hw_open(struct inode *inode, - struct file *filp) -{ - return single_open(filp, debugfs_bist_phy_v3_hw_show, - inode->i_private); -} - -static const struct file_operations debugfs_bist_phy_v3_hw_fops = { - .open = debugfs_bist_phy_v3_hw_open, - .read = seq_read, - .write = debugfs_bist_phy_v3_hw_write, - .llseek = seq_lseek, - .release = single_release, - .owner = THIS_MODULE, -}; +DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_phy_v3_hw); static ssize_t debugfs_bist_cnt_v3_hw_write(struct file *filp, const char __user *buf, @@ -4177,22 +4132,7 @@ static int debugfs_bist_cnt_v3_hw_show(struct seq_file *s, void *p) return 0; } - -static int debugfs_bist_cnt_v3_hw_open(struct inode *inode, - struct file *filp) -{ - return single_open(filp, debugfs_bist_cnt_v3_hw_show, - inode->i_private); -} - -static const struct file_operations debugfs_bist_cnt_v3_hw_ops = { - .open = debugfs_bist_cnt_v3_hw_open, - .read = seq_read, - .write = debugfs_bist_cnt_v3_hw_write, - .llseek = seq_lseek, - .release = single_release, - .owner = THIS_MODULE, -}; +DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_cnt_v3_hw); static const struct { int value; @@ -4256,22 +4196,7 @@ static ssize_t debugfs_bist_mode_v3_hw_write(struct file *filp, return count; } - -static int debugfs_bist_mode_v3_hw_open(struct inode *inode, - struct file *filp) -{ - return single_open(filp, debugfs_bist_mode_v3_hw_show, - inode->i_private); -} - -static const struct file_operations debugfs_bist_mode_v3_hw_fops = { - .open = debugfs_bist_mode_v3_hw_open, - .read = seq_read, - .write = debugfs_bist_mode_v3_hw_write, - .llseek = seq_lseek, - .release = single_release, - .owner = 
THIS_MODULE, -}; +DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_mode_v3_hw); static ssize_t debugfs_bist_enable_v3_hw_write(struct file *filp, const char __user *buf, @@ -4309,22 +4234,7 @@ static int debugfs_bist_enable_v3_hw_show(struct seq_file *s, void *p) return 0; } - -static int debugfs_bist_enable_v3_hw_open(struct inode *inode, - struct file *filp) -{ - return single_open(filp, debugfs_bist_enable_v3_hw_show, - inode->i_private); -} - -static const struct file_operations debugfs_bist_enable_v3_hw_fops = { - .open = debugfs_bist_enable_v3_hw_open, - .read = seq_read, - .write = debugfs_bist_enable_v3_hw_write, - .llseek = seq_lseek, - .release = single_release, - .owner = THIS_MODULE, -}; +DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_enable_v3_hw); static const struct { char *name; @@ -4362,21 +4272,7 @@ static int debugfs_v3_hw_show(struct seq_file *s, void *p) return 0; } - -static int debugfs_v3_hw_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, debugfs_v3_hw_show, - inode->i_private); -} - -static const struct file_operations debugfs_v3_hw_fops = { - .open = debugfs_v3_hw_open, - .read = seq_read, - .write = debugfs_v3_hw_write, - .llseek = seq_lseek, - .release = single_release, - .owner = THIS_MODULE, -}; +DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_v3_hw); static ssize_t debugfs_phy_down_cnt_v3_hw_write(struct file *filp, const char __user *buf, @@ -4407,22 +4303,7 @@ static int debugfs_phy_down_cnt_v3_hw_show(struct seq_file *s, void *p) return 0; } - -static int debugfs_phy_down_cnt_v3_hw_open(struct inode *inode, - struct file *filp) -{ - return single_open(filp, debugfs_phy_down_cnt_v3_hw_show, - inode->i_private); -} - -static const struct file_operations debugfs_phy_down_cnt_v3_hw_fops = { - .open = debugfs_phy_down_cnt_v3_hw_open, - .read = seq_read, - .write = debugfs_phy_down_cnt_v3_hw_write, - .llseek = seq_lseek, - .release = single_release, - .owner = THIS_MODULE, -}; +DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_phy_down_cnt_v3_hw); enum fifo_dump_mode_v3_hw { FIFO_DUMP_FORVER = (1U << 0), @@ -4832,7 +4713,7 @@ static void debugfs_bist_init_v3_hw(struct hisi_hba *hisi_hba) hisi_hba, &debugfs_bist_phy_v3_hw_fops); debugfs_create_file("cnt", 0600, hisi_hba->debugfs_bist_dentry, - hisi_hba, &debugfs_bist_cnt_v3_hw_ops); + hisi_hba, &debugfs_bist_cnt_v3_hw_fops); debugfs_create_file("loopback_mode", 0600, hisi_hba->debugfs_bist_dentry, From a9d56ce053dacf6e4171c984e2c6e689f10e94bd Mon Sep 17 00:00:00 2001 From: Xingui Yang Date: Tue, 5 Sep 2023 02:48:35 +0000 Subject: [PATCH 12/64] scsi: qla2xxx: use DEFINE_SHOW_STORE_ATTRIBUTE() helper for debugfs Use DEFINE_SHOW_STORE_ATTRIBUTE() helper for read-write file to reduce some duplicated code. Link: https://lkml.kernel.org/r/20230905024835.43219-4-yangxingui@huawei.com Signed-off-by: Luo Jiaxing Co-developed-by: Xingui Yang Signed-off-by: Xingui Yang Reviewed-by: Andy Shevchenko Cc: Al Viro Cc: Animesh Manna Cc: Anshuman Gupta Cc: Damien Le Moal Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Himanshu Madhani Cc: James Bottomley Cc: John Garry Cc: Martin K. 
Petersen Cc: Uma Shankar Cc: Xiang Chen Cc: Zeng Tao Signed-off-by: Andrew Morton --- drivers/scsi/qla2xxx/qla_dfs.c | 59 ++++------------------------------ 1 file changed, 7 insertions(+), 52 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_dfs.c b/drivers/scsi/qla2xxx/qla_dfs.c index a7a364760b80..55ff3d7482b3 100644 --- a/drivers/scsi/qla2xxx/qla_dfs.c +++ b/drivers/scsi/qla2xxx/qla_dfs.c @@ -528,51 +528,22 @@ qla_dfs_naqp_show(struct seq_file *s, void *unused) * * Example for creating "TEST" sysfs file: * 1. struct qla_hw_data { ... struct dentry *dfs_TEST; } - * 2. QLA_DFS_SETUP_RD(TEST, scsi_qla_host_t); + * 2. QLA_DFS_SETUP_RD(TEST); * 3. In qla2x00_dfs_setup(): * QLA_DFS_CREATE_FILE(ha, TEST, 0600, ha->dfs_dir, vha); * 4. In qla2x00_dfs_remove(): * QLA_DFS_REMOVE_FILE(ha, TEST); */ -#define QLA_DFS_SETUP_RD(_name, _ctx_struct) \ -static int \ -qla_dfs_##_name##_open(struct inode *inode, struct file *file) \ -{ \ - _ctx_struct *__ctx = inode->i_private; \ - \ - return single_open(file, qla_dfs_##_name##_show, __ctx); \ -} \ - \ -static const struct file_operations qla_dfs_##_name##_ops = { \ - .open = qla_dfs_##_name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; +#define QLA_DFS_SETUP_RD(_name) DEFINE_SHOW_ATTRIBUTE(qla_dfs_##_name) -#define QLA_DFS_SETUP_RW(_name, _ctx_struct) \ -static int \ -qla_dfs_##_name##_open(struct inode *inode, struct file *file) \ -{ \ - _ctx_struct *__ctx = inode->i_private; \ - \ - return single_open(file, qla_dfs_##_name##_show, __ctx); \ -} \ - \ -static const struct file_operations qla_dfs_##_name##_ops = { \ - .open = qla_dfs_##_name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ - .write = qla_dfs_##_name##_write, \ -}; +#define QLA_DFS_SETUP_RW(_name) DEFINE_SHOW_STORE_ATTRIBUTE(qla_dfs_##_name) #define QLA_DFS_ROOT_CREATE_FILE(_name, _perm, _ctx) \ do { \ if (!qla_dfs_##_name) \ qla_dfs_##_name = debugfs_create_file(#_name, \ _perm, qla2x00_dfs_root, _ctx, \ - &qla_dfs_##_name##_ops); \ + &qla_dfs_##_name##_fops); \ } while (0) #define QLA_DFS_ROOT_REMOVE_FILE(_name) \ @@ -587,7 +558,7 @@ static const struct file_operations qla_dfs_##_name##_ops = { \ do { \ (_struct)->dfs_##_name = debugfs_create_file(#_name, \ _perm, _parent, _ctx, \ - &qla_dfs_##_name##_ops) \ + &qla_dfs_##_name##_fops) \ } while (0) #define QLA_DFS_REMOVE_FILE(_struct, _name) \ @@ -598,14 +569,6 @@ static const struct file_operations qla_dfs_##_name##_ops = { \ } \ } while (0) -static int -qla_dfs_naqp_open(struct inode *inode, struct file *file) -{ - struct scsi_qla_host *vha = inode->i_private; - - return single_open(file, qla_dfs_naqp_show, vha); -} - static ssize_t qla_dfs_naqp_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) @@ -653,15 +616,7 @@ out_free: kfree(buf); return rc; } - -static const struct file_operations dfs_naqp_ops = { - .open = qla_dfs_naqp_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = qla_dfs_naqp_write, -}; - +QLA_DFS_SETUP_RW(naqp); int qla2x00_dfs_setup(scsi_qla_host_t *vha) @@ -707,7 +662,7 @@ create_nodes: if (IS_QLA27XX(ha) || IS_QLA83XX(ha) || IS_QLA28XX(ha)) { ha->tgt.dfs_naqp = debugfs_create_file("naqp", - 0400, ha->dfs_dir, vha, &dfs_naqp_ops); + 0400, ha->dfs_dir, vha, &qla_dfs_naqp_fops); if (IS_ERR(ha->tgt.dfs_naqp)) { ql_log(ql_log_warn, vha, 0xd011, "Unable to create debugFS naqp node.\n"); From 7904e53ed5a20fc678c01d5d1b07ec486425bb6a Mon Sep 17 00:00:00 2001 From: Oleg 
Nesterov Date: Sat, 9 Sep 2023 18:45:01 +0200 Subject: [PATCH 13/64] fs/proc: do_task_stat: use __for_each_thread() do/while_each_thread should be avoided when possible. Link: https://lkml.kernel.org/r/20230909164501.GA11581@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric W. Biederman Signed-off-by: Andrew Morton --- fs/proc/array.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 2c2efbe685d8..ff08a8957552 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -536,12 +536,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, /* add up live thread stats at the group level */ if (whole) { - struct task_struct *t = task; - do { + struct task_struct *t; + + __for_each_thread(sig, t) { min_flt += t->min_flt; maj_flt += t->maj_flt; gtime += task_gtime(t); - } while_each_thread(task, t); + } min_flt += sig->min_flt; maj_flt += sig->maj_flt; From e5ecf29c507830f192d3eb255662ad9ba219c442 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 9 Sep 2023 18:45:37 +0200 Subject: [PATCH 14/64] signal: complete_signal: use __for_each_thread() do/while_each_thread should be avoided when possible. Link: https://lkml.kernel.org/r/20230909164537.GA11633@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric W. Biederman Signed-off-by: Andrew Morton --- kernel/signal.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index fc276fc07d87..ccfc3ded5672 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1058,12 +1058,11 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type) signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = sig; signal->group_stop_count = 0; - t = p; - do { + __for_each_thread(signal, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); - } while_each_thread(p, t); + } return; } } From c7ac8231ace9b07306d0299969e42073b189c70a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 9 Sep 2023 19:25:54 +0200 Subject: [PATCH 15/64] getrusage: add the "signal_struct *sig" local variable No functional changes, cleanup/preparation. Link: https://lkml.kernel.org/r/20230909172554.GA20441@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric W. 
Biederman Signed-off-by: Andrew Morton --- kernel/sys.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 2410e3999ebe..097cbea62a72 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1786,6 +1786,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) unsigned long flags; u64 tgutime, tgstime, utime, stime; unsigned long maxrss = 0; + struct signal_struct *sig = p->signal; memset((char *)r, 0, sizeof (*r)); utime = stime = 0; @@ -1793,7 +1794,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) if (who == RUSAGE_THREAD) { task_cputime_adjusted(current, &utime, &stime); accumulate_thread_rusage(p, r); - maxrss = p->signal->maxrss; + maxrss = sig->maxrss; goto out; } @@ -1803,15 +1804,15 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) switch (who) { case RUSAGE_BOTH: case RUSAGE_CHILDREN: - utime = p->signal->cutime; - stime = p->signal->cstime; - r->ru_nvcsw = p->signal->cnvcsw; - r->ru_nivcsw = p->signal->cnivcsw; - r->ru_minflt = p->signal->cmin_flt; - r->ru_majflt = p->signal->cmaj_flt; - r->ru_inblock = p->signal->cinblock; - r->ru_oublock = p->signal->coublock; - maxrss = p->signal->cmaxrss; + utime = sig->cutime; + stime = sig->cstime; + r->ru_nvcsw = sig->cnvcsw; + r->ru_nivcsw = sig->cnivcsw; + r->ru_minflt = sig->cmin_flt; + r->ru_majflt = sig->cmaj_flt; + r->ru_inblock = sig->cinblock; + r->ru_oublock = sig->coublock; + maxrss = sig->cmaxrss; if (who == RUSAGE_CHILDREN) break; @@ -1821,14 +1822,14 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) thread_group_cputime_adjusted(p, &tgutime, &tgstime); utime += tgutime; stime += tgstime; - r->ru_nvcsw += p->signal->nvcsw; - r->ru_nivcsw += p->signal->nivcsw; - r->ru_minflt += p->signal->min_flt; - r->ru_majflt += p->signal->maj_flt; - r->ru_inblock += p->signal->inblock; - r->ru_oublock += p->signal->oublock; - if (maxrss < p->signal->maxrss) - maxrss = p->signal->maxrss; + r->ru_nvcsw += sig->nvcsw; + r->ru_nivcsw += sig->nivcsw; + r->ru_minflt += sig->min_flt; + r->ru_majflt += sig->maj_flt; + r->ru_inblock += sig->inblock; + r->ru_oublock += sig->oublock; + if (maxrss < sig->maxrss) + maxrss = sig->maxrss; t = p; do { accumulate_thread_rusage(t, r); From 13b7bc60b5353371460a203df6c38ccd38ad7a3a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 9 Sep 2023 19:26:29 +0200 Subject: [PATCH 16/64] getrusage: use __for_each_thread() do/while_each_thread should be avoided when possible. Plus this change allows to avoid lock_task_sighand(), we can use rcu and/or sig->stats_lock instead. Link: https://lkml.kernel.org/r/20230909172629.GA20454@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric W. Biederman Signed-off-by: Andrew Morton --- kernel/sys.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 097cbea62a72..67436d465be4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1830,10 +1830,8 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_oublock += sig->oublock; if (maxrss < sig->maxrss) maxrss = sig->maxrss; - t = p; - do { + __for_each_thread(sig, t) accumulate_thread_rusage(t, r); - } while_each_thread(p, t); break; default: From ed5378a387fd7c382497f2abcf4605e030b64044 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 9 Sep 2023 23:49:51 +0200 Subject: [PATCH 17/64] taskstats: fill_stats_for_tgid: use for_each_thread() do/while_each_thread should be avoided when possible. 
Plus I _think_ this change allows to avoid lock_task_sighand() but I am not sure, I forgot everything about taskstats. In any case, this code does not look right in that the same thread can be accounted twice: taskstats_exit() can account the exiting thread in signal->stats and drop ->siglock but this thread is still on the thread-group list, so lock_task_sighand() can't help. Link: https://lkml.kernel.org/r/20230909214951.GA24274@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric W. Biederman Signed-off-by: Andrew Morton --- kernel/taskstats.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 8ce3fa0c19e2..4354ea231fab 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -233,9 +233,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) else memset(stats, 0, sizeof(*stats)); - tsk = first; start_time = ktime_get_ns(); - do { + for_each_thread(first, tsk) { if (tsk->exit_state) continue; /* @@ -258,7 +257,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) stats->nvcsw += tsk->nvcsw; stats->nivcsw += tsk->nivcsw; - } while_each_thread(first, tsk); + } unlock_task_sighand(first, &flags); rc = 0; From 6309727ef27162deabd5c095c11af24970fba5a2 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 8 Sep 2023 01:40:48 +0200 Subject: [PATCH 18/64] kthread: add kthread_stop_put Add a kthread_stop_put() helper that stops a thread and puts its task struct. Use it to replace the various instances of kthread_stop() followed by put_task_struct(). Remove the kthread_stop_put() macro in usbip that is similar but doesn't return the result of kthread_stop(). [agruenba@redhat.com: fix kerneldoc comment] Link: https://lkml.kernel.org/r/20230911111730.2565537-1-agruenba@redhat.com [akpm@linux-foundation.org: document kthread_stop_put()'s argument] Link: https://lkml.kernel.org/r/20230907234048.2499820-1-agruenba@redhat.com Signed-off-by: Andreas Gruenbacher Signed-off-by: Andrew Morton --- drivers/accel/ivpu/ivpu_job.c | 3 +-- drivers/dma-buf/st-dma-fence-chain.c | 12 ++++-------- drivers/dma-buf/st-dma-fence.c | 4 +--- drivers/gpu/drm/i915/gt/selftest_migrate.c | 4 +--- drivers/net/xen-netback/interface.c | 3 +-- drivers/usb/usbip/usbip_common.h | 6 ------ fs/gfs2/ops_fstype.c | 9 +++------ include/linux/kthread.h | 1 + kernel/irq/manage.c | 15 +++++---------- kernel/kthread.c | 18 ++++++++++++++++++ kernel/smpboot.c | 3 +-- mm/damon/core.c | 3 +-- net/core/pktgen.c | 3 +-- 13 files changed, 38 insertions(+), 46 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_job.c b/drivers/accel/ivpu/ivpu_job.c index de9e69f70af7..76f468c9f761 100644 --- a/drivers/accel/ivpu/ivpu_job.c +++ b/drivers/accel/ivpu/ivpu_job.c @@ -618,6 +618,5 @@ int ivpu_job_done_thread_init(struct ivpu_device *vdev) void ivpu_job_done_thread_fini(struct ivpu_device *vdev) { - kthread_stop(vdev->job_done_thread); - put_task_struct(vdev->job_done_thread); + kthread_stop_put(vdev->job_done_thread); } diff --git a/drivers/dma-buf/st-dma-fence-chain.c b/drivers/dma-buf/st-dma-fence-chain.c index c0979c8049b5..9c2a0c082a76 100644 --- a/drivers/dma-buf/st-dma-fence-chain.c +++ b/drivers/dma-buf/st-dma-fence-chain.c @@ -476,10 +476,9 @@ static int find_race(void *arg) for (i = 0; i < ncpus; i++) { int ret; - ret = kthread_stop(threads[i]); + ret = kthread_stop_put(threads[i]); if (ret && !err) err = ret; - put_task_struct(threads[i]); } kfree(threads); @@ -591,8 +590,7 @@ static int wait_forward(void *arg) for (i = 0; i < 
fc.chain_length; i++) dma_fence_signal(fc.fences[i]); - err = kthread_stop(tsk); - put_task_struct(tsk); + err = kthread_stop_put(tsk); err: fence_chains_fini(&fc); @@ -621,8 +619,7 @@ static int wait_backward(void *arg) for (i = fc.chain_length; i--; ) dma_fence_signal(fc.fences[i]); - err = kthread_stop(tsk); - put_task_struct(tsk); + err = kthread_stop_put(tsk); err: fence_chains_fini(&fc); @@ -669,8 +666,7 @@ static int wait_random(void *arg) for (i = 0; i < fc.chain_length; i++) dma_fence_signal(fc.fences[i]); - err = kthread_stop(tsk); - put_task_struct(tsk); + err = kthread_stop_put(tsk); err: fence_chains_fini(&fc); diff --git a/drivers/dma-buf/st-dma-fence.c b/drivers/dma-buf/st-dma-fence.c index fb6e0a6ae2c9..b7c6f7ea9e0c 100644 --- a/drivers/dma-buf/st-dma-fence.c +++ b/drivers/dma-buf/st-dma-fence.c @@ -548,11 +548,9 @@ static int race_signal_callback(void *arg) for (i = 0; i < ARRAY_SIZE(t); i++) { int err; - err = kthread_stop(t[i].task); + err = kthread_stop_put(t[i].task); if (err && !ret) ret = err; - - put_task_struct(t[i].task); } } diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c b/drivers/gpu/drm/i915/gt/selftest_migrate.c index 3def5ca72dec..0fb07f073baa 100644 --- a/drivers/gpu/drm/i915/gt/selftest_migrate.c +++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c @@ -719,11 +719,9 @@ static int threaded_migrate(struct intel_migrate *migrate, if (IS_ERR_OR_NULL(tsk)) continue; - status = kthread_stop(tsk); + status = kthread_stop_put(tsk); if (status && !err) err = status; - - put_task_struct(tsk); } kfree(thread); diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index f3f2c07423a6..33c8143619f0 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -672,8 +672,7 @@ err: static void xenvif_disconnect_queue(struct xenvif_queue *queue) { if (queue->task) { - kthread_stop(queue->task); - put_task_struct(queue->task); + kthread_stop_put(queue->task); queue->task = NULL; } diff --git a/drivers/usb/usbip/usbip_common.h b/drivers/usb/usbip/usbip_common.h index d8cbd2dfc2c2..282efca64a01 100644 --- a/drivers/usb/usbip/usbip_common.h +++ b/drivers/usb/usbip/usbip_common.h @@ -298,12 +298,6 @@ struct usbip_device { __k; \ }) -#define kthread_stop_put(k) \ - do { \ - kthread_stop(k); \ - put_task_struct(k); \ - } while (0) - /* usbip_common.c */ void usbip_dump_urb(struct urb *purb); void usbip_dump_header(struct usbip_header *pdu); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 33ca04733e93..ecf789b7168c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1126,8 +1126,7 @@ static int init_threads(struct gfs2_sbd *sdp) return 0; fail: - kthread_stop(sdp->sd_logd_process); - put_task_struct(sdp->sd_logd_process); + kthread_stop_put(sdp->sd_logd_process); sdp->sd_logd_process = NULL; return error; } @@ -1135,13 +1134,11 @@ fail: void gfs2_destroy_threads(struct gfs2_sbd *sdp) { if (sdp->sd_logd_process) { - kthread_stop(sdp->sd_logd_process); - put_task_struct(sdp->sd_logd_process); + kthread_stop_put(sdp->sd_logd_process); sdp->sd_logd_process = NULL; } if (sdp->sd_quotad_process) { - kthread_stop(sdp->sd_quotad_process); - put_task_struct(sdp->sd_quotad_process); + kthread_stop_put(sdp->sd_quotad_process); sdp->sd_quotad_process = NULL; } } diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 2c30ade43bc8..b11f53c1ba2e 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -86,6 +86,7 @@ void free_kthread_struct(struct task_struct *k); void 
kthread_bind(struct task_struct *k, unsigned int cpu); void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask); int kthread_stop(struct task_struct *k); +int kthread_stop_put(struct task_struct *k); bool kthread_should_stop(void); bool kthread_should_park(void); bool kthread_should_stop_or_park(void); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d309ba84e08a..1782f90cd8c6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1852,15 +1852,13 @@ out_thread: struct task_struct *t = new->thread; new->thread = NULL; - kthread_stop(t); - put_task_struct(t); + kthread_stop_put(t); } if (new->secondary && new->secondary->thread) { struct task_struct *t = new->secondary->thread; new->secondary->thread = NULL; - kthread_stop(t); - put_task_struct(t); + kthread_stop_put(t); } out_mput: module_put(desc->owner); @@ -1971,12 +1969,9 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) * the same bit to a newly requested action. */ if (action->thread) { - kthread_stop(action->thread); - put_task_struct(action->thread); - if (action->secondary && action->secondary->thread) { - kthread_stop(action->secondary->thread); - put_task_struct(action->secondary->thread); - } + kthread_stop_put(action->thread); + if (action->secondary && action->secondary->thread) + kthread_stop_put(action->secondary->thread); } /* Last action releases resources */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 1eea53050bab..290cbc845225 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -715,6 +715,24 @@ int kthread_stop(struct task_struct *k) } EXPORT_SYMBOL(kthread_stop); +/** + * kthread_stop_put - stop a thread and put its task struct + * @k: thread created by kthread_create(). + * + * Stops a thread created by kthread_create() and put its task_struct. + * Only use when holding an extra task struct reference obtained by + * calling get_task_struct(). 
+ */ +int kthread_stop_put(struct task_struct *k) +{ + int ret; + + ret = kthread_stop(k); + put_task_struct(k); + return ret; +} +EXPORT_SYMBOL(kthread_stop_put); + int kthreadd(void *unused) { struct task_struct *tsk = current; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index f47d8f375946..1992b62e980b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -272,8 +272,7 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); if (tsk) { - kthread_stop(tsk); - put_task_struct(tsk); + kthread_stop_put(tsk); *per_cpu_ptr(ht->store, cpu) = NULL; } } diff --git a/mm/damon/core.c b/mm/damon/core.c index bcd2bd9d6c10..2f54f153d7f5 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -699,8 +699,7 @@ static int __damon_stop(struct damon_ctx *ctx) if (tsk) { get_task_struct(tsk); mutex_unlock(&ctx->kdamond_lock); - kthread_stop(tsk); - put_task_struct(tsk); + kthread_stop_put(tsk); return 0; } mutex_unlock(&ctx->kdamond_lock); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index f56b8d697014..826250a0f5b1 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3982,8 +3982,7 @@ static void __net_exit pg_net_exit(struct net *net) list_for_each_safe(q, n, &list) { t = list_entry(q, struct pktgen_thread, th_list); list_del(&t->th_list); - kthread_stop(t->tsk); - put_task_struct(t->tsk); + kthread_stop_put(t->tsk); kfree(t); } From 5e57418a2031cd5e1863efdf3d7447a16a368172 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 11 Sep 2023 18:49:13 +0300 Subject: [PATCH 19/64] minmax: deduplicate __unconst_integer_typeof() It appears that compiler_types.h already have an implementation of the __unconst_integer_typeof() called __unqual_scalar_typeof(). Use it instead of the copy. Link: https://lkml.kernel.org/r/20230911154913.4176033-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Herve Codina Signed-off-by: Andrew Morton --- include/linux/minmax.h | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 83aebc244cba..69bbe987fa87 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -2,6 +2,7 @@ #ifndef _LINUX_MINMAX_H #define _LINUX_MINMAX_H +#include #include #include @@ -134,27 +135,6 @@ */ #define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >) -/* - * Remove a const qualifier from integer types - * _Generic(foo, type-name: association, ..., default: association) performs a - * comparison against the foo type (not the qualified type). - * Do not use the const keyword in the type-name as it will not match the - * unqualified type of foo. - */ -#define __unconst_integer_type_cases(type) \ - unsigned type: (unsigned type)0, \ - signed type: (signed type)0 - -#define __unconst_integer_typeof(x) typeof( \ - _Generic((x), \ - char: (char)0, \ - __unconst_integer_type_cases(char), \ - __unconst_integer_type_cases(short), \ - __unconst_integer_type_cases(int), \ - __unconst_integer_type_cases(long), \ - __unconst_integer_type_cases(long long), \ - default: (x))) - /* * Do not check the array parameter using __must_be_array(). * In the following legit use-case where the "array" passed is a simple pointer, @@ -169,13 +149,13 @@ * 'int *buff' and 'int buff[N]' types. * * The array can be an array of const items. - * typeof() keeps the const qualifier. Use __unconst_integer_typeof() in order + * typeof() keeps the const qualifier. 
Use __unqual_scalar_typeof() in order * to discard the const qualifier for the __element variable. */ #define __minmax_array(op, array, len) ({ \ typeof(&(array)[0]) __array = (array); \ typeof(len) __len = (len); \ - __unconst_integer_typeof(__array[0]) __element = __array[--__len]; \ + __unqual_scalar_typeof(__array[0]) __element = __array[--__len];\ while (__len--) \ __element = op(__element, __array[__len]); \ __element; }) From 2d57792a39e5473c5d98fc0138c4e9ab9c98a57b Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Mon, 11 Sep 2023 22:55:09 +0800 Subject: [PATCH 20/64] pid: pid_ns_ctl_handler: remove useless comment commit 95846ecf9dac("pid: replace pid bitmap implementation with IDR API") removes 'last_pid' element, and use the idr_get_cursor-idr_set_cursor pair to set the value of idr, so useless comments should be removed. Link: https://lkml.kernel.org/r/tencent_157A2A1CAF19A3F5885F0687426159A19708@qq.com Signed-off-by: Rong Tao Cc: Aleksa Sarai Cc: Christian Brauner Cc: Frederic Weisbecker Cc: Jeff Xu Cc: Kees Cook Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- kernel/pid_namespace.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 619972c78774..3028b2218aa4 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -286,12 +286,6 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) return -EPERM; - /* - * Writing directly to ns' last_pid field is OK, since this field - * is volatile in a living namespace anyway and a code writing to - * it should synchronize its usage with external means. - */ - next = idr_get_cursor(&pid_ns->idr) - 1; tmp.data = &next; From f6e9d38f8eb00ac8b52e6d15f6aa9bcecacb081b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 12 Sep 2023 12:23:55 +0300 Subject: [PATCH 21/64] minmax: fix header inclusions BUILD_BUG_ON*() macros are defined in build_bug.h. Include it. Replace compiler_types.h by compiler.h, which provides the former, to have a definition of the __UNIQUE_ID(). Link: https://lkml.kernel.org/r/20230912092355.79280-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Herve Codina Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/minmax.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 69bbe987fa87..ca69abd6151e 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -2,7 +2,8 @@ #ifndef _LINUX_MINMAX_H #define _LINUX_MINMAX_H -#include +#include +#include #include #include From e22c3872e4d58191538a2c31d47bb6bcec0fdfa0 Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Thu, 31 Aug 2023 19:38:27 +0000 Subject: [PATCH 22/64] fs: ocfs2: replace strlcpy with sysfs_emit strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with sysfs_emit(). Direct replacement is safe here since its ok for `kernel_param_ops.get()` to return -errno [3]. 
[1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 [3] https://elixir.bootlin.com/linux/v6.5/source/include/linux/moduleparam.h#L52 Link: https://lkml.kernel.org/r/20230831193827.1528867-1-azeemshaikh38@gmail.com Signed-off-by: Azeem Shaikh Reviewed-by: Kees Cook Reviewed-by: Joseph Qi Cc: Christian Brauner Cc: Dave Chinner Cc: Jan Kara Cc: Jeff Layton Cc: Joel Becker Cc: Mark Fasheh Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dlmfs/dlmfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 81265123ce6c..b38776ba3306 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -80,8 +80,7 @@ static int param_set_dlmfs_capabilities(const char *val, static int param_get_dlmfs_capabilities(char *buffer, const struct kernel_param *kp) { - return strlcpy(buffer, DLMFS_CAPABILITIES, - strlen(DLMFS_CAPABILITIES) + 1); + return sysfs_emit(buffer, DLMFS_CAPABILITIES); } module_param_call(capabilities, param_set_dlmfs_capabilities, param_get_dlmfs_capabilities, NULL, 0444); From a6304272b03ece97346f16923453f7e36ec19a5a Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 14 Sep 2023 11:31:34 +0800 Subject: [PATCH 23/64] crash_core.c: remove unnecessary parameter of function Patch series "kdump: use generic functions to simplify crashkernel reservation in arch", v3. In the current arm64, crashkernel=,high support has been finished after several rounds of posting and careful reviewing. The code in arm64 which parses crashkernel kernel parameters firstly, then reserve memory can be a good example for other ARCH to refer to. Whereas in x86_64, the code mixing crashkernel parameter parsing and memory reserving is twisted, and looks messy. Refactoring the code to make it more readable maintainable is necessary. Here, firstly abstract the crashkernel parameter parsing code into parse_crashkernel() to make it be able to parse crashkernel=,high|low. Then abstract the crashkernel memory reserving code into a generic function reserve_crashkernel_generic(). Finally, in ARCH which crashkernel=,high support is needed, a simple arch_reserve_crashkernel() can be added to call above two functions. This can remove the duplicated implmentation code in each ARCH, like arm64, x86_64 and riscv. crashkernel=512M,high crashkernel=512M,high crashkernel=256M,low crashkernel=512M,high crashkernel=0M,low crashkernel=0M,high crashkernel=256M,low crashkernel=512M crashkernel=512M@0x4f000000 crashkernel=1G-4G:256M,4G-64G:320M,64G-:576M crashkernel=0M This patch (of 9): In all call sites of __parse_crashkernel(), the parameter 'name' is hardcoded as "crashkernel=". So remove the unnecessary parameter 'name', add local varibale 'name' inside __parse_crashkernel() instead. 
Link: https://lkml.kernel.org/r/20230914033142.676708-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20230914033142.676708-2-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Zhen Lei Cc: Catalin Marinas Cc: Chen Jiahao Cc: Zhen Lei Signed-off-by: Andrew Morton --- kernel/crash_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 2f675ef045d4..507113932aa9 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -248,11 +248,11 @@ static int __init __parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base, - const char *name, const char *suffix) { char *first_colon, *first_space; char *ck_cmdline; + char *name = "crashkernel="; BUG_ON(!crash_size || !crash_base); *crash_size = 0; @@ -290,7 +290,7 @@ int __init parse_crashkernel(char *cmdline, unsigned long long *crash_base) { return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", NULL); + NULL); } int __init parse_crashkernel_high(char *cmdline, @@ -299,7 +299,7 @@ int __init parse_crashkernel_high(char *cmdline, unsigned long long *crash_base) { return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_HIGH]); + suffix_tbl[SUFFIX_HIGH]); } int __init parse_crashkernel_low(char *cmdline, @@ -308,7 +308,7 @@ int __init parse_crashkernel_low(char *cmdline, unsigned long long *crash_base) { return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_LOW]); + suffix_tbl[SUFFIX_LOW]); } /* From a9e1a3d84e4a0ea560ed4d84c28d06dbfdffed22 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 14 Sep 2023 11:31:35 +0800 Subject: [PATCH 24/64] crash_core: change the prototype of function parse_crashkernel() Add two parameters 'low_size' and 'high' to function parse_crashkernel(), later crashkernel=,high|low parsing will be added. Make adjustments in all call sites of parse_crashkernel() in arch. 
Link: https://lkml.kernel.org/r/20230914033142.676708-3-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Zhen Lei Cc: Catalin Marinas Cc: Chen Jiahao Signed-off-by: Andrew Morton --- arch/arm/kernel/setup.c | 3 ++- arch/arm64/mm/init.c | 2 +- arch/ia64/kernel/setup.c | 2 +- arch/loongarch/kernel/setup.c | 4 +++- arch/mips/kernel/setup.c | 3 ++- arch/powerpc/kernel/fadump.c | 2 +- arch/powerpc/kexec/core.c | 2 +- arch/powerpc/mm/nohash/kaslr_booke.c | 2 +- arch/riscv/mm/init.c | 2 +- arch/s390/kernel/setup.c | 4 ++-- arch/sh/kernel/machine_kexec.c | 2 +- arch/x86/kernel/setup.c | 3 ++- include/linux/crash_core.h | 3 ++- kernel/crash_core.c | 15 ++++++++++++--- 14 files changed, 32 insertions(+), 17 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index c66b560562b3..e2bb7afd0683 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -1010,7 +1010,8 @@ static void __init reserve_crashkernel(void) total_mem = get_total_mem(); ret = parse_crashkernel(boot_command_line, total_mem, - &crash_size, &crash_base); + &crash_size, &crash_base, + NULL, NULL); /* invalid value specified or crashkernel=0 */ if (ret || !crash_size) return; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 8a0f8604348b..801c59c39a8f 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -142,7 +142,7 @@ static void __init reserve_crashkernel(void) /* crashkernel=X[@offset] */ ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), - &crash_size, &crash_base); + &crash_size, &crash_base, NULL, NULL); if (ret == -ENOENT) { ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base); if (ret || !crash_size) diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 5a55ac82c13a..4faea2d2cf07 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -277,7 +277,7 @@ static void __init setup_crashkernel(unsigned long total, int *n) int ret; ret = parse_crashkernel(boot_command_line, total, - &size, &base); + &size, &base, NULL, NULL); if (ret == 0 && size > 0) { if (!base) { sort_regions(rsvd_region, *n); diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 7783f0a3d742..4de32b07c0dc 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -267,7 +267,9 @@ static void __init arch_parse_crashkernel(void) unsigned long long crash_base, crash_size; total_mem = memblock_phys_mem_size(); - ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); + ret = parse_crashkernel(boot_command_line, total_mem, + &crash_size, &crash_base, + NULL, NULL); if (ret < 0 || crash_size <= 0) return; diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index cb871eb784a7..08321c945ac4 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -460,7 +460,8 @@ static void __init mips_parse_crashkernel(void) total_mem = memblock_phys_mem_size(); ret = parse_crashkernel(boot_command_line, total_mem, - &crash_size, &crash_base); + &crash_size, &crash_base, + NULL, NULL); if (ret != 0 || crash_size <= 0) return; diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 3ff2da7b120b..d14eda1e8589 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -313,7 +313,7 @@ static __init u64 fadump_calculate_reserve_size(void) * memory at a predefined offset. 
*/ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), - &size, &base); + &size, &base, NULL, NULL); if (ret == 0 && size > 0) { unsigned long max_size; diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index de64c7962991..9346c960b296 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -109,7 +109,7 @@ void __init reserve_crashkernel(void) total_mem_sz = memory_limit ? memory_limit : memblock_phys_mem_size(); /* use common parsing */ ret = parse_crashkernel(boot_command_line, total_mem_sz, - &crash_size, &crash_base); + &crash_size, &crash_base, NULL, NULL); if (ret == 0 && crash_size > 0) { crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index 2fb3edafe9ab..b4f2786a7d2b 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -178,7 +178,7 @@ static void __init get_crash_kernel(void *fdt, unsigned long size) int ret; ret = parse_crashkernel(boot_command_line, size, &crash_size, - &crash_base); + &crash_base, NULL, NULL); if (ret != 0 || crash_size == 0) return; if (crash_base == 0) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 0798bd861dcb..9fe448900059 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1388,7 +1388,7 @@ static void __init reserve_crashkernel(void) } ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), - &crash_size, &crash_base); + &crash_size, &crash_base, NULL, NULL); if (ret == -ENOENT) { /* Fallback to crashkernel=X,[high,low] */ ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index de6ad0fb2328..e555b576d3c8 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -625,8 +625,8 @@ static void __init reserve_crashkernel(void) phys_addr_t low, high; int rc; - rc = parse_crashkernel(boot_command_line, ident_map_size, &crash_size, - &crash_base); + rc = parse_crashkernel(boot_command_line, ident_map_size, + &crash_size, &crash_base, NULL, NULL); crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN); crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN); diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c index 223c14f44af7..fa3a7b36190a 100644 --- a/arch/sh/kernel/machine_kexec.c +++ b/arch/sh/kernel/machine_kexec.c @@ -154,7 +154,7 @@ void __init reserve_crashkernel(void) int ret; ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), - &crash_size, &crash_base); + &crash_size, &crash_base, NULL, NULL); if (ret == 0 && crash_size > 0) { crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b098b1fa2470..655c04812905 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -553,7 +553,8 @@ static void __init reserve_crashkernel(void) total_mem = memblock_phys_mem_size(); /* crashkernel=XM */ - ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); + ret = parse_crashkernel(boot_command_line, total_mem, + &crash_size, &crash_base, NULL, NULL); if (ret != 0 || crash_size <= 0) { /* crashkernel=X,high */ ret = parse_crashkernel_high(boot_command_line, total_mem, diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 0c06561bf5ff..6156355ef831 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -80,7 +80,8 @@ Elf_Word 
*append_elf_note(Elf_Word *buf, char *name, unsigned int type, void final_note(Elf_Word *buf); int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); + unsigned long long *crash_size, unsigned long long *crash_base, + unsigned long long *low_size, bool *high); int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 507113932aa9..33ced5b5ed4e 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -287,10 +287,19 @@ static int __init __parse_crashkernel(char *cmdline, int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, - unsigned long long *crash_base) + unsigned long long *crash_base, + unsigned long long *low_size, + bool *high) { - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - NULL); + int ret; + + /* crashkernel=X[@offset] */ + ret = __parse_crashkernel(cmdline, system_ram, crash_size, + crash_base, NULL); + if (!high) + return ret; + + return 0; } int __init parse_crashkernel_high(char *cmdline, From 70916e9c8d9f1a286c99727072b22e395097909f Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 14 Sep 2023 11:31:36 +0800 Subject: [PATCH 25/64] crash_core: change parse_crashkernel() to support crashkernel=,high|low parsing Now parse_crashkernel() is a real entry point for all kinds of crahskernel parsing on any architecture. And wrap the crahskernel=,high|low handling inside CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION ifdeffery scope. Link: https://lkml.kernel.org/r/20230914033142.676708-4-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Zhen Lei Cc: Catalin Marinas Cc: Chen Jiahao Signed-off-by: Andrew Morton --- include/linux/crash_core.h | 6 ++++++ kernel/crash_core.c | 36 +++++++++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 6156355ef831..d8050a7eab01 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -79,6 +79,12 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); void final_note(Elf_Word *buf); +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE +#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) +#endif +#endif + int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base, unsigned long long *low_size, bool *high); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 33ced5b5ed4e..99a243540a35 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -283,6 +283,9 @@ static int __init __parse_crashkernel(char *cmdline, /* * That function is the entry point for command line parsing and should be * called from the arch-specific code. + * + * If crashkernel=,high|low is supported on architecture, non-NULL values + * should be passed to parameters 'low_size' and 'high'. 
*/ int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, @@ -296,10 +299,37 @@ int __init parse_crashkernel(char *cmdline, /* crashkernel=X[@offset] */ ret = __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, NULL); - if (!high) - return ret; +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION + /* + * If non-NULL 'high' passed in and no normal crashkernel + * setting detected, try parsing crashkernel=,high|low. + */ + if (high && ret == -ENOENT) { + ret = __parse_crashkernel(cmdline, 0, crash_size, + crash_base, suffix_tbl[SUFFIX_HIGH]); + if (ret || !*crash_size) + return -EINVAL; - return 0; + /* + * crashkernel=Y,low can be specified or not, but invalid value + * is not allowed. + */ + ret = __parse_crashkernel(cmdline, 0, low_size, + crash_base, suffix_tbl[SUFFIX_LOW]); + if (ret == -ENOENT) { + *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + ret = 0; + } else if (ret) { + return ret; + } + + *high = true; + } +#endif + if (!*crash_size) + ret = -EINVAL; + + return ret; } int __init parse_crashkernel_high(char *cmdline, From 0ab97169aa0517079b22c2e64192906caa5dc6d5 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 14 Sep 2023 11:31:37 +0800 Subject: [PATCH 26/64] crash_core: add generic function to do reservation Architectures like x86_64, arm64 and riscv have a vast virtual address space and usually a large amount of physical RAM. Their crashkernel reservation doesn't have to be limited under 4G RAM, but can be extended to the whole physical memory via crashkernel=,high support. Now add function reserve_crashkernel_generic() to reserve crashkernel memory if users specify any variant of the kernel parameter, like crashkernel=xM[@offset] or crashkernel=,high|low. This is preparation to simplify the crashkernel=,high support code in architectures.
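The search and fallback policy reserve_crashkernel_generic() implements can be seen in isolation in the following stand-alone sketch; alloc_in_range() is a made-up stand-in for memblock_phys_alloc_range() and the 4G/64G limits are arbitrary example values, not the kernel's:

/*
 * Stand-alone illustration of the search/fallback policy described above.
 * (The real code also reserves a separate low region when needed.)
 */
#include <stdbool.h>
#include <stdio.h>

#define EX_ADDR_LOW_MAX         (4ULL << 30)    /* example: 4G */
#define EX_ADDR_HIGH_MAX        (64ULL << 30)   /* example: end of RAM */

/* Pretend only memory above 4G is still free. */
static unsigned long long alloc_in_range(unsigned long long size,
                                         unsigned long long base,
                                         unsigned long long end)
{
        if (base >= EX_ADDR_LOW_MAX && base + size <= end)
                return base;            /* "allocated" at the range start */
        return 0;                       /* allocation failed */
}

static void reserve_example(unsigned long long crash_size,
                            unsigned long long crash_base, bool high)
{
        unsigned long long search_base = 0, search_end = EX_ADDR_LOW_MAX;
        bool fixed_base = false;

        if (crash_base) {                       /* crashkernel=X@offset */
                fixed_base = true;
                search_base = crash_base;
                search_end = crash_base + crash_size;
        } else if (high) {                      /* crashkernel=X,high */
                search_base = EX_ADDR_LOW_MAX;
                search_end = EX_ADDR_HIGH_MAX;
        }

retry:
        crash_base = alloc_in_range(crash_size, search_base, search_end);
        if (!crash_base) {
                if (fixed_base) {
                        printf("fixed base: give up\n");
                        return;
                }
                if (!high && search_end == EX_ADDR_LOW_MAX) {
                        /* low region full: fall back to high memory */
                        search_base = EX_ADDR_LOW_MAX;
                        search_end = EX_ADDR_HIGH_MAX;
                        goto retry;
                }
                if (high && search_end == EX_ADDR_HIGH_MAX) {
                        /* high region full: fall back to low memory */
                        search_base = 0;
                        search_end = EX_ADDR_LOW_MAX;
                        goto retry;
                }
                printf("reservation failed\n");
                return;
        }
        printf("reserved 0x%llx - 0x%llx\n", crash_base,
               crash_base + crash_size);
}

int main(void)
{
        reserve_example(256ULL << 20, 0, false);        /* crashkernel=256M */
        reserve_example(256ULL << 20, 0, true);         /* crashkernel=256M,high */
        return 0;
}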
Link: https://lkml.kernel.org/r/20230914033142.676708-5-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Zhen Lei Cc: Catalin Marinas Cc: Chen Jiahao Signed-off-by: Andrew Morton --- include/linux/crash_core.h | 28 ++++++++++ kernel/crash_core.c | 107 ++++++++++++++++++++++++++++++++++++- 2 files changed, 134 insertions(+), 1 deletion(-) diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index d8050a7eab01..4dbd6565e0ff 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -93,6 +93,34 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE +#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) +#endif +#ifndef CRASH_ALIGN +#define CRASH_ALIGN SZ_2M +#endif +#ifndef CRASH_ADDR_LOW_MAX +#define CRASH_ADDR_LOW_MAX SZ_4G +#endif +#ifndef CRASH_ADDR_HIGH_MAX +#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() +#endif + +void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high); +#else +static inline void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) +{} +#endif + /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 99a243540a35..72e358197b52 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -5,7 +5,6 @@ */ #include -#include #include #include #include @@ -13,6 +12,9 @@ #include #include #include +#include +#include +#include #include #include @@ -360,6 +362,109 @@ static int __init parse_crashkernel_dummy(char *arg) } early_param("crashkernel", parse_crashkernel_dummy); +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +static int __init reserve_crashkernel_low(unsigned long long low_size) +{ +#ifdef CONFIG_64BIT + unsigned long long low_base; + + low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); + if (!low_base) { + pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); + return -ENOMEM; + } + + pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n", + low_base, low_base + low_size, low_size >> 20); + + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + insert_resource(&iomem_resource, &crashk_low_res); +#endif + return 0; +} + +void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) +{ + unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; + bool fixed_base = false; + + /* User specifies base address explicitly. */ + if (crash_base) { + fixed_base = true; + search_base = crash_base; + search_end = crash_base + crash_size; + } else if (high) { + search_base = CRASH_ADDR_LOW_MAX; + search_end = CRASH_ADDR_HIGH_MAX; + } + +retry: + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, + search_base, search_end); + if (!crash_base) { + /* + * For crashkernel=size[KMG]@offset[KMG], print out failure + * message if can't reserve the specified region. 
+ */ + if (fixed_base) { + pr_warn("crashkernel reservation failed - memory is in use.\n"); + return; + } + + /* + * For crashkernel=size[KMG], if the first attempt was for + * low memory, fall back to high memory, the minimum required + * low memory will be reserved later. + */ + if (!high && search_end == CRASH_ADDR_LOW_MAX) { + search_end = CRASH_ADDR_HIGH_MAX; + search_base = CRASH_ADDR_LOW_MAX; + crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + goto retry; + } + + /* + * For crashkernel=size[KMG],high, if the first attempt was + * for high memory, fall back to low memory. + */ + if (high && search_end == CRASH_ADDR_HIGH_MAX) { + search_end = CRASH_ADDR_LOW_MAX; + search_base = 0; + goto retry; + } + pr_warn("cannot allocate crashkernel (size:0x%llx)\n", + crash_size); + return; + } + + if ((crash_base > CRASH_ADDR_LOW_MAX) && + crash_low_size && reserve_crashkernel_low(crash_low_size)) { + memblock_phys_free(crash_base, crash_size); + return; + } + + pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + crash_base, crash_base + crash_size, crash_size >> 20); + + /* + * The crashkernel memory will be removed from the kernel linear + * map. Inform kmemleak so that it won't try to access it. + */ + kmemleak_ignore_phys(crash_base); + if (crashk_low_res.end) + kmemleak_ignore_phys(crashk_low_res.start); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + insert_resource(&iomem_resource, &crashk_res); +} +#endif + int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { From b631b95dded5e7f007a3a79cbaf82ef50c1e2cf7 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 14 Sep 2023 11:31:38 +0800 Subject: [PATCH 27/64] crash_core: move crashk_*res definition into crash_core.c Both crashk_res and crashk_low_res are used to mark the reserved crashkernel regions in iomem_resource tree. And later the generic crashkernel resrvation will be added into crash_core.c. So move crashk_res and crashk_low_res definition into crash_core.c to avoid compiling error if CONFIG_CRASH_CORE=on while CONFIG_KEXEC_CORE is unset. Meanwhile include in if generic reservation is needed. In that case, need be added by ARCH. In asm/crash_core.h, ARCH can provide its own macro definitions to override macros in if needed. Wrap the including into CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION ifdeffery scope to avoid compiling error in other ARCH-es which don't take the generic reservation way yet. Link: https://lkml.kernel.org/r/20230914033142.676708-6-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Zhen Lei Cc: Catalin Marinas Cc: Chen Jiahao Signed-off-by: Andrew Morton --- include/linux/crash_core.h | 8 ++++++++ include/linux/kexec.h | 4 ---- kernel/crash_core.c | 16 ++++++++++++++++ kernel/kexec_core.c | 17 ----------------- 4 files changed, 24 insertions(+), 21 deletions(-) diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 4dbd6565e0ff..3c735a7e33fb 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -5,6 +5,14 @@ #include #include #include +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#include +#endif + +/* Location of a reserved region to hold the crash kernel. 
+ */ +extern struct resource crashk_res; +extern struct resource crashk_low_res; #define CRASH_CORE_NOTE_NAME "CORE" #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 32c78078552c..8227455192b7 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -22,10 +22,6 @@ #include #include -/* Location of a reserved region to hold the crash kernel. - */ -extern struct resource crashk_res; -extern struct resource crashk_low_res; extern note_buf_t __percpu *crash_notes; #ifdef CONFIG_KEXEC_CORE diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 72e358197b52..fa8808c4f00e 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -35,6 +35,22 @@ u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ static unsigned char *vmcoreinfo_data_safecopy; +/* Location of the reserved area for the crash kernel */ +struct resource crashk_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; +struct resource crashk_low_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; + /* * parsing the "crashkernel" commandline * diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 9dc728982d79..be5642a4ec49 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -52,23 +52,6 @@ atomic_t __kexec_lock = ATOMIC_INIT(0); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; - -/* Location of the reserved area for the crash kernel */ -struct resource crashk_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; -struct resource crashk_low_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; - int kexec_should_crash(struct task_struct *p) { /* From 9c08a2a139fe83f217625ee0352ce531b9a666ea Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 14 Sep 2023 11:31:39 +0800 Subject: [PATCH 28/64] x86: kdump: use generic interface to simplify crashkernel reservation code With the help of newly changed function parse_crashkernel() and generic reserve_crashkernel_generic(), crashkernel reservation can be simplified by steps: 1) Add a new header file , and define CRASH_ALIGN, CRASH_ADDR_LOW_MAX, CRASH_ADDR_HIGH_MAX and DEFAULT_CRASH_KERNEL_LOW_SIZE in ; 2) Add arch_reserve_crashkernel() to call parse_crashkernel() and reserve_crashkernel_generic(), and do the ARCH specific work if needed. 3) Add ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION Kconfig in arch/x86/Kconfig. When adding DEFAULT_CRASH_KERNEL_LOW_SIZE, add crash_low_size_default() to calculate crashkernel low memory because x86_64 has special requirement. The old reserve_crashkernel_low() and reserve_crashkernel() can be removed. 
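For illustration only: step 1) works because the generic crash_core header pulls in the arch header first and wraps its own macros in #ifndef, so whatever the architecture defines wins. A stand-alone sketch of that pattern, with both headers collapsed into one file and example values (the 16M alignment mirrors the x86 header added below):

/*
 * Stand-alone sketch of the #ifndef-default pattern that lets an arch
 * header override the generic limits.  Values are examples only.
 */
#include <stdio.h>

/* --- what an architecture's asm crash_core header might provide --- */
#define CRASH_ALIGN                     (16UL << 20)    /* arch wants 16M */
/* CRASH_ADDR_LOW_MAX deliberately left undefined by this arch */

/* --- the generic defaults supplied afterwards --- */
#ifndef CRASH_ALIGN
#define CRASH_ALIGN                     (2UL << 20)     /* generic 2M */
#endif
#ifndef CRASH_ADDR_LOW_MAX
#define CRASH_ADDR_LOW_MAX              (4ULL << 30)    /* generic 4G */
#endif
#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
#define DEFAULT_CRASH_KERNEL_LOW_SIZE   (128UL << 20)   /* generic 128M */
#endif

int main(void)
{
        /* The arch override wins; untouched macros fall back to defaults. */
        printf("CRASH_ALIGN                   = %lu MB\n", CRASH_ALIGN >> 20);
        printf("CRASH_ADDR_LOW_MAX            = %llu GB\n",
               (unsigned long long)CRASH_ADDR_LOW_MAX >> 30);
        printf("DEFAULT_CRASH_KERNEL_LOW_SIZE = %lu MB\n",
               DEFAULT_CRASH_KERNEL_LOW_SIZE >> 20);
        return 0;
}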
[bhe@redhat.com: move crash_low_size_default() code into ] Link: https://lkml.kernel.org/r/ZQpeAjOmuMJBFw1/@MiWiFi-R3L-srv Link: https://lkml.kernel.org/r/20230914033142.676708-7-bhe@redhat.com Signed-off-by: Baoquan He Cc: Catalin Marinas Cc: Chen Jiahao Cc: Zhen Lei Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 3 + arch/x86/include/asm/crash_core.h | 42 +++++++++ arch/x86/kernel/setup.c | 148 +++--------------------------- 3 files changed, 56 insertions(+), 137 deletions(-) create mode 100644 arch/x86/include/asm/crash_core.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 66bfabae8814..36b2f12f31c3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2062,6 +2062,9 @@ config ARCH_SUPPORTS_CRASH_DUMP config ARCH_SUPPORTS_CRASH_HOTPLUG def_bool y +config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION + def_bool CRASH_CORE + config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP) default "0x1000000" diff --git a/arch/x86/include/asm/crash_core.h b/arch/x86/include/asm/crash_core.h new file mode 100644 index 000000000000..76af98f4e801 --- /dev/null +++ b/arch/x86/include/asm/crash_core.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_CRASH_CORE_H +#define _X86_CRASH_CORE_H + +/* 16M alignment for crash kernel regions */ +#define CRASH_ALIGN SZ_16M + +/* + * Keep the crash kernel below this limit. + * + * Earlier 32-bits kernels would limit the kernel to the low 512 MB range + * due to mapping restrictions. + * + * 64-bit kdump kernels need to be restricted to be under 64 TB, which is + * the upper limit of system RAM in 4-level paging mode. Since the kdump + * jump could be from 5-level paging to 4-level paging, the jump will fail if + * the kernel is put above 64 TB, and during the 1st kernel bootup there's + * no good way to detect the paging mode of the target kernel which will be + * loaded for dumping. + */ +extern unsigned long swiotlb_size_or_default(void); + +#ifdef CONFIG_X86_32 +# define CRASH_ADDR_LOW_MAX SZ_512M +# define CRASH_ADDR_HIGH_MAX SZ_512M +#else +# define CRASH_ADDR_LOW_MAX SZ_4G +# define CRASH_ADDR_HIGH_MAX SZ_64T +#endif + +# define DEFAULT_CRASH_KERNEL_LOW_SIZE crash_low_size_default() + +static inline unsigned long crash_low_size_default(void) +{ +#ifdef CONFIG_X86_64 + return max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); +#else + return 0; +#endif +} + +#endif /* _X86_CRASH_CORE_H */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 655c04812905..25a3f9a100f6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -466,155 +466,29 @@ static void __init memblock_x86_reserve_range_setup_data(void) } } -/* - * --------- Crashkernel reservation ------------------------------ - */ - -/* 16M alignment for crash kernel regions */ -#define CRASH_ALIGN SZ_16M - -/* - * Keep the crash kernel below this limit. - * - * Earlier 32-bits kernels would limit the kernel to the low 512 MB range - * due to mapping restrictions. - * - * 64-bit kdump kernels need to be restricted to be under 64 TB, which is - * the upper limit of system RAM in 4-level paging mode. Since the kdump - * jump could be from 5-level paging to 4-level paging, the jump will fail if - * the kernel is put above 64 TB, and during the 1st kernel bootup there's - * no good way to detect the paging mode of the target kernel which will be - * loaded for dumping. 
- */ -#ifdef CONFIG_X86_32 -# define CRASH_ADDR_LOW_MAX SZ_512M -# define CRASH_ADDR_HIGH_MAX SZ_512M -#else -# define CRASH_ADDR_LOW_MAX SZ_4G -# define CRASH_ADDR_HIGH_MAX SZ_64T -#endif - -static int __init reserve_crashkernel_low(void) +static void __init arch_reserve_crashkernel(void) { -#ifdef CONFIG_X86_64 - unsigned long long base, low_base = 0, low_size = 0; - unsigned long low_mem_limit; - int ret; - - low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX); - - /* crashkernel=Y,low */ - ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base); - if (ret) { - /* - * two parts from kernel/dma/swiotlb.c: - * -swiotlb size: user-specified with swiotlb= or default. - * - * -swiotlb overflow buffer: now hardcoded to 32k. We round it - * to 8M for other buffers that may need to stay low too. Also - * make sure we allocate enough extra low memory so that we - * don't run out of DMA buffers for 32-bit devices. - */ - low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); - } else { - /* passed with crashkernel=0,low ? */ - if (!low_size) - return 0; - } - - low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); - if (!low_base) { - pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", - (unsigned long)(low_size >> 20)); - return -ENOMEM; - } - - pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n", - (unsigned long)(low_size >> 20), - (unsigned long)(low_base >> 20), - (unsigned long)(low_mem_limit >> 20)); - - crashk_low_res.start = low_base; - crashk_low_res.end = low_base + low_size - 1; - insert_resource(&iomem_resource, &crashk_low_res); -#endif - return 0; -} - -static void __init reserve_crashkernel(void) -{ - unsigned long long crash_size, crash_base, total_mem; + unsigned long long crash_base, crash_size, low_size = 0; + char *cmdline = boot_command_line; bool high = false; int ret; if (!IS_ENABLED(CONFIG_KEXEC_CORE)) return; - total_mem = memblock_phys_mem_size(); - - /* crashkernel=XM */ - ret = parse_crashkernel(boot_command_line, total_mem, - &crash_size, &crash_base, NULL, NULL); - if (ret != 0 || crash_size <= 0) { - /* crashkernel=X,high */ - ret = parse_crashkernel_high(boot_command_line, total_mem, - &crash_size, &crash_base); - if (ret != 0 || crash_size <= 0) - return; - high = true; - } + ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), + &crash_size, &crash_base, + &low_size, &high); + if (ret) + return; if (xen_pv_domain()) { pr_info("Ignoring crashkernel for a Xen PV domain\n"); return; } - /* 0 means: find the address automatically */ - if (!crash_base) { - /* - * Set CRASH_ADDR_LOW_MAX upper bound for crash memory, - * crashkernel=x,high reserves memory over 4G, also allocates - * 256M extra low memory for DMA buffers and swiotlb. - * But the extra memory is not required for all machines. - * So try low memory first and fall back to high memory - * unless "crashkernel=size[KMG],high" is specified. 
- */ - if (!high) - crash_base = memblock_phys_alloc_range(crash_size, - CRASH_ALIGN, CRASH_ALIGN, - CRASH_ADDR_LOW_MAX); - if (!crash_base) - crash_base = memblock_phys_alloc_range(crash_size, - CRASH_ALIGN, CRASH_ALIGN, - CRASH_ADDR_HIGH_MAX); - if (!crash_base) { - pr_info("crashkernel reservation failed - No suitable area found.\n"); - return; - } - } else { - unsigned long long start; - - start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base, - crash_base + crash_size); - if (start != crash_base) { - pr_info("crashkernel reservation failed - memory is in use.\n"); - return; - } - } - - if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { - memblock_phys_free(crash_base, crash_size); - return; - } - - pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(total_mem >> 20)); - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; - insert_resource(&iomem_resource, &crashk_res); + reserve_crashkernel_generic(cmdline, crash_size, crash_base, + low_size, high); } static struct resource standard_io_resources[] = { @@ -1228,7 +1102,7 @@ void __init setup_arch(char **cmdline_p) * Reserve memory for crash kernel after SRAT is parsed so that it * won't consume hotpluggable memory. */ - reserve_crashkernel(); + arch_reserve_crashkernel(); memblock_find_dma_reserve(); From fdc268232dbbae8d3acdc5536370b53327be3bb0 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 14 Sep 2023 11:31:40 +0800 Subject: [PATCH 29/64] arm64: kdump: use generic interface to simplify crashkernel reservation With the help of newly changed function parse_crashkernel() and generic reserve_crashkernel_generic(), crashkernel reservation can be simplified by steps: 1) Add a new header file , and define CRASH_ALIGN, CRASH_ADDR_LOW_MAX, CRASH_ADDR_HIGH_MAX and DEFAULT_CRASH_KERNEL_LOW_SIZE in ; 2) Add arch_reserve_crashkernel() to call parse_crashkernel() and reserve_crashkernel_generic(); 3) Add ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION Kconfig in arch/arm64/Kconfig. The old reserve_crashkernel_low() and reserve_crashkernel() can be removed. 
Link: https://lkml.kernel.org/r/20230914033142.676708-8-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Zhen Lei Cc: Catalin Marinas Cc: Chen Jiahao Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 3 + arch/arm64/include/asm/crash_core.h | 10 ++ arch/arm64/mm/init.c | 140 ++-------------------------- 3 files changed, 21 insertions(+), 132 deletions(-) create mode 100644 arch/arm64/include/asm/crash_core.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b10515c0200b..e7d374d994ad 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1483,6 +1483,9 @@ config ARCH_DEFAULT_KEXEC_IMAGE_VERIFY_SIG config ARCH_SUPPORTS_CRASH_DUMP def_bool y +config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION + def_bool CRASH_CORE + config TRANS_TABLE def_bool y depends on HIBERNATION || KEXEC_CORE diff --git a/arch/arm64/include/asm/crash_core.h b/arch/arm64/include/asm/crash_core.h new file mode 100644 index 000000000000..9f5c8d339f44 --- /dev/null +++ b/arch/arm64/include/asm/crash_core.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ARM64_CRASH_CORE_H +#define _ARM64_CRASH_CORE_H + +/* Current arm64 boot protocol requires 2MB alignment */ +#define CRASH_ALIGN SZ_2M + +#define CRASH_ADDR_LOW_MAX arm64_dma_phys_limit +#define CRASH_ADDR_HIGH_MAX (PHYS_MASK + 1) +#endif diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 801c59c39a8f..f2bf32e19371 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -64,15 +64,6 @@ EXPORT_SYMBOL(memstart_addr); */ phys_addr_t __ro_after_init arm64_dma_phys_limit; -/* Current arm64 boot protocol requires 2MB alignment */ -#define CRASH_ALIGN SZ_2M - -#define CRASH_ADDR_LOW_MAX arm64_dma_phys_limit -#define CRASH_ADDR_HIGH_MAX (PHYS_MASK + 1) -#define CRASH_HIGH_SEARCH_BASE SZ_4G - -#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) - /* * To make optimal use of block mappings when laying out the linear * mapping, round down the base of physical memory to a size that can @@ -100,140 +91,25 @@ phys_addr_t __ro_after_init arm64_dma_phys_limit; #define ARM64_MEMSTART_ALIGN (1UL << ARM64_MEMSTART_SHIFT) #endif -static int __init reserve_crashkernel_low(unsigned long long low_size) +static void __init arch_reserve_crashkernel(void) { - unsigned long long low_base; - - low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); - if (!low_base) { - pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); - return -ENOMEM; - } - - pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n", - low_base, low_base + low_size, low_size >> 20); - - crashk_low_res.start = low_base; - crashk_low_res.end = low_base + low_size - 1; - insert_resource(&iomem_resource, &crashk_low_res); - - return 0; -} - -/* - * reserve_crashkernel() - reserves memory for crash kernel - * - * This function reserves memory area given in "crashkernel=" kernel command - * line parameter. The memory reserved is used by dump capture kernel when - * primary kernel is crashing. 
- */ -static void __init reserve_crashkernel(void) -{ - unsigned long long crash_low_size = 0, search_base = 0; - unsigned long long crash_max = CRASH_ADDR_LOW_MAX; + unsigned long long low_size = 0; unsigned long long crash_base, crash_size; char *cmdline = boot_command_line; - bool fixed_base = false; bool high = false; int ret; if (!IS_ENABLED(CONFIG_KEXEC_CORE)) return; - /* crashkernel=X[@offset] */ ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), - &crash_size, &crash_base, NULL, NULL); - if (ret == -ENOENT) { - ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base); - if (ret || !crash_size) - return; - - /* - * crashkernel=Y,low can be specified or not, but invalid value - * is not allowed. - */ - ret = parse_crashkernel_low(cmdline, 0, &crash_low_size, &crash_base); - if (ret == -ENOENT) - crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; - else if (ret) - return; - - search_base = CRASH_HIGH_SEARCH_BASE; - crash_max = CRASH_ADDR_HIGH_MAX; - high = true; - } else if (ret || !crash_size) { - /* The specified value is invalid */ + &crash_size, &crash_base, + &low_size, &high); + if (ret) return; - } - crash_size = PAGE_ALIGN(crash_size); - - /* User specifies base address explicitly. */ - if (crash_base) { - fixed_base = true; - search_base = crash_base; - crash_max = crash_base + crash_size; - } - -retry: - crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, - search_base, crash_max); - if (!crash_base) { - /* - * For crashkernel=size[KMG]@offset[KMG], print out failure - * message if can't reserve the specified region. - */ - if (fixed_base) { - pr_warn("crashkernel reservation failed - memory is in use.\n"); - return; - } - - /* - * For crashkernel=size[KMG], if the first attempt was for - * low memory, fall back to high memory, the minimum required - * low memory will be reserved later. - */ - if (!high && crash_max == CRASH_ADDR_LOW_MAX) { - crash_max = CRASH_ADDR_HIGH_MAX; - search_base = CRASH_ADDR_LOW_MAX; - crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; - goto retry; - } - - /* - * For crashkernel=size[KMG],high, if the first attempt was - * for high memory, fall back to low memory. - */ - if (high && crash_max == CRASH_ADDR_HIGH_MAX) { - crash_max = CRASH_ADDR_LOW_MAX; - search_base = 0; - goto retry; - } - pr_warn("cannot allocate crashkernel (size:0x%llx)\n", - crash_size); - return; - } - - if ((crash_base >= CRASH_ADDR_LOW_MAX) && crash_low_size && - reserve_crashkernel_low(crash_low_size)) { - memblock_phys_free(crash_base, crash_size); - return; - } - - pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", - crash_base, crash_base + crash_size, crash_size >> 20); - - /* - * The crashkernel memory will be removed from the kernel linear - * map. Inform kmemleak so that it won't try to access it. - */ - kmemleak_ignore_phys(crash_base); - if (crashk_low_res.end) - kmemleak_ignore_phys(crashk_low_res.start); - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; - insert_resource(&iomem_resource, &crashk_res); + reserve_crashkernel_generic(cmdline, crash_size, crash_base, + low_size, high); } /* @@ -479,7 +355,7 @@ void __init bootmem_init(void) * request_standard_resources() depends on crashkernel's memory being * reserved, so do it here. 
*/ - reserve_crashkernel(); + arch_reserve_crashkernel(); memblock_dump_all(); } From 39365395046fe74bebf6f70e86439f48ba56a75e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 14 Sep 2023 11:31:41 +0800 Subject: [PATCH 30/64] riscv: kdump: use generic interface to simplify crashkernel reservation With the help of newly changed function parse_crashkernel() and generic reserve_crashkernel_generic(), crashkernel reservation can be simplified by steps: 1) Add a new header file , and define CRASH_ALIGN, CRASH_ADDR_LOW_MAX, CRASH_ADDR_HIGH_MAX and DEFAULT_CRASH_KERNEL_LOW_SIZE in ; 2) Add arch_reserve_crashkernel() to call parse_crashkernel() and reserve_crashkernel_generic(); 3) Add ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION Kconfig in arch/riscv/Kconfig. The old reserve_crashkernel_low() and reserve_crashkernel() can be removed. [chenjiahao16@huawei.com: fix crashkernel reserving problem on RISC-V] Link: https://lkml.kernel.org/r/20230925024333.730964-1-chenjiahao16@huawei.com Link: https://lkml.kernel.org/r/20230914033142.676708-9-bhe@redhat.com Signed-off-by: Baoquan He Signed-off-by: Chen Jiahao Cc: Catalin Marinas Cc: Chen Jiahao Cc: Zhen Lei Signed-off-by: Andrew Morton --- arch/riscv/Kconfig | 3 + arch/riscv/include/asm/crash_core.h | 11 +++ arch/riscv/include/asm/processor.h | 2 + arch/riscv/kernel/setup.c | 13 --- arch/riscv/mm/init.c | 141 +++------------------------- 5 files changed, 27 insertions(+), 143 deletions(-) create mode 100644 arch/riscv/include/asm/crash_core.h diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index d607ab0f7c6d..25474f8c12b7 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -694,6 +694,9 @@ config ARCH_SUPPORTS_KEXEC_PURGATORY config ARCH_SUPPORTS_CRASH_DUMP def_bool y +config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION + def_bool CRASH_CORE + config COMPAT bool "Kernel support for 32-bit U-mode" default 64BIT diff --git a/arch/riscv/include/asm/crash_core.h b/arch/riscv/include/asm/crash_core.h new file mode 100644 index 000000000000..e1874b23feaf --- /dev/null +++ b/arch/riscv/include/asm/crash_core.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _RISCV_CRASH_CORE_H +#define _RISCV_CRASH_CORE_H + +#define CRASH_ALIGN PMD_SIZE + +#define CRASH_ADDR_LOW_MAX dma32_phys_limit +#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() + +extern phys_addr_t memblock_end_of_DRAM(void); +#endif diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 3e23e1786d05..441da1839c94 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -116,6 +116,8 @@ static inline void wait_for_interrupt(void) __asm__ __volatile__ ("wfi"); } +extern phys_addr_t dma32_phys_limit; + struct device_node; int riscv_of_processor_hartid(struct device_node *node, unsigned long *hartid); int riscv_early_of_processor_hartid(struct device_node *node, unsigned long *hartid); diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index e600aab116a4..aac853ae4eb7 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -173,19 +173,6 @@ static void __init init_resources(void) if (ret < 0) goto error; -#ifdef CONFIG_KEXEC_CORE - if (crashk_res.start != crashk_res.end) { - ret = add_resource(&iomem_resource, &crashk_res); - if (ret < 0) - goto error; - } - if (crashk_low_res.start != crashk_low_res.end) { - ret = add_resource(&iomem_resource, &crashk_low_res); - if (ret < 0) - goto error; - } -#endif - #ifdef CONFIG_CRASH_DUMP if (elfcorehdr_size > 0) { 
elfcorehdr_res.start = elfcorehdr_addr; diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 9fe448900059..d9a4e8702864 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -65,7 +65,7 @@ extern char _start[]; void *_dtb_early_va __initdata; uintptr_t _dtb_early_pa __initdata; -static phys_addr_t dma32_phys_limit __initdata; +phys_addr_t dma32_phys_limit __initdata; static void __init zone_sizes_init(void) { @@ -1333,28 +1333,6 @@ static inline void setup_vm_final(void) } #endif /* CONFIG_MMU */ -/* Reserve 128M low memory by default for swiotlb buffer */ -#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) - -static int __init reserve_crashkernel_low(unsigned long long low_size) -{ - unsigned long long low_base; - - low_base = memblock_phys_alloc_range(low_size, PMD_SIZE, 0, dma32_phys_limit); - if (!low_base) { - pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); - return -ENOMEM; - } - - pr_info("crashkernel low memory reserved: 0x%016llx - 0x%016llx (%lld MB)\n", - low_base, low_base + low_size, low_size >> 20); - - crashk_low_res.start = low_base; - crashk_low_res.end = low_base + low_size - 1; - - return 0; -} - /* * reserve_crashkernel() - reserves memory for crash kernel * @@ -1362,122 +1340,25 @@ static int __init reserve_crashkernel_low(unsigned long long low_size) * line parameter. The memory reserved is used by dump capture kernel when * primary kernel is crashing. */ -static void __init reserve_crashkernel(void) +static void __init arch_reserve_crashkernel(void) { - unsigned long long crash_base = 0; - unsigned long long crash_size = 0; - unsigned long long crash_low_size = 0; - unsigned long search_start = memblock_start_of_DRAM(); - unsigned long search_end = (unsigned long)dma32_phys_limit; + unsigned long long low_size = 0; + unsigned long long crash_base, crash_size; char *cmdline = boot_command_line; - bool fixed_base = false; bool high = false; - - int ret = 0; + int ret; if (!IS_ENABLED(CONFIG_KEXEC_CORE)) return; - /* - * Don't reserve a region for a crash kernel on a crash kernel - * since it doesn't make much sense and we have limited memory - * resources. - */ - if (is_kdump_kernel()) { - pr_info("crashkernel: ignoring reservation request\n"); - return; - } ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), - &crash_size, &crash_base, NULL, NULL); - if (ret == -ENOENT) { - /* Fallback to crashkernel=X,[high,low] */ - ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base); - if (ret || !crash_size) - return; - - /* - * crashkernel=Y,low is valid only when crashkernel=X,high - * is passed. - */ - ret = parse_crashkernel_low(cmdline, 0, &crash_low_size, &crash_base); - if (ret == -ENOENT) - crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; - else if (ret) - return; - - search_start = (unsigned long)dma32_phys_limit; - search_end = memblock_end_of_DRAM(); - high = true; - } else if (ret || !crash_size) { - /* Invalid argument value specified */ + &crash_size, &crash_base, + &low_size, &high); + if (ret) return; - } - crash_size = PAGE_ALIGN(crash_size); - - if (crash_base) { - fixed_base = true; - search_start = crash_base; - search_end = crash_base + crash_size; - } - - /* - * Current riscv boot protocol requires 2MB alignment for - * RV64 and 4MB alignment for RV32 (hugepage size) - * - * Try to alloc from 32bit addressible physical memory so that - * swiotlb can work on the crash kernel. 
- */ - crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, - search_start, search_end); - if (crash_base == 0) { - /* - * For crashkernel=size[KMG]@offset[KMG], print out failure - * message if can't reserve the specified region. - */ - if (fixed_base) { - pr_warn("crashkernel: allocating failed with given size@offset\n"); - return; - } - - if (high) { - /* - * For crashkernel=size[KMG],high, if the first attempt was - * for high memory, fall back to low memory. - */ - search_start = memblock_start_of_DRAM(); - search_end = (unsigned long)dma32_phys_limit; - } else { - /* - * For crashkernel=size[KMG], if the first attempt was for - * low memory, fall back to high memory, the minimum required - * low memory will be reserved later. - */ - search_start = (unsigned long)dma32_phys_limit; - search_end = memblock_end_of_DRAM(); - crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; - } - - crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, - search_start, search_end); - if (crash_base == 0) { - pr_warn("crashkernel: couldn't allocate %lldKB\n", - crash_size >> 10); - return; - } - } - - if ((crash_base >= dma32_phys_limit) && crash_low_size && - reserve_crashkernel_low(crash_low_size)) { - memblock_phys_free(crash_base, crash_size); - return; - } - - pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n", - crash_base, crash_base + crash_size, crash_size >> 20); - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; + reserve_crashkernel_generic(cmdline, crash_size, crash_base, + low_size, high); } void __init paging_init(void) @@ -1495,7 +1376,7 @@ void __init misc_mem_init(void) arch_numa_init(); sparse_init(); zone_sizes_init(); - reserve_crashkernel(); + arch_reserve_crashkernel(); memblock_dump_all(); } From c37e56cac3d62c69f093904afbc58fc428484d14 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 14 Sep 2023 11:31:42 +0800 Subject: [PATCH 31/64] crash_core.c: remove unneeded functions So far, nobody calls functions parse_crashkernel_high() and parse_crashkernel_low(), remove both of them. 
Link: https://lkml.kernel.org/r/20230914033142.676708-10-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Zhen Lei Cc: Catalin Marinas Cc: Chen Jiahao Signed-off-by: Andrew Morton --- include/linux/crash_core.h | 4 ---- kernel/crash_core.c | 18 ------------------ 2 files changed, 22 deletions(-) diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 3c735a7e33fb..3426f6eef60b 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -96,10 +96,6 @@ void final_note(Elf_Word *buf); int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base, unsigned long long *low_size, bool *high); -int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); -int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE diff --git a/kernel/crash_core.c b/kernel/crash_core.c index fa8808c4f00e..efe87d501c8c 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -350,24 +350,6 @@ int __init parse_crashkernel(char *cmdline, return ret; } -int __init parse_crashkernel_high(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - suffix_tbl[SUFFIX_HIGH]); -} - -int __init parse_crashkernel_low(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - suffix_tbl[SUFFIX_LOW]); -} - /* * Add a dummy early_param handler to mark crashkernel= as a known command line * parameter and suppress incorrect warnings in init/main.c. From 5097a69d676f7e562240dbe81c21b1c62aaf5950 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 16 Sep 2023 21:21:31 +0300 Subject: [PATCH 32/64] extract and use FILE_LINE macro Extract the nifty FILE_LINE macro, useful for printk-style debugging: printk("%s\n", FILE_LINE); It should probably not be used en masse because __FILE__ string literals can be merged while FILE_LINE's won't. But for debugging it is what the doctor ordered. Don't add leading and trailing underscores, they're painful to type. Trust me, I've tried both versions.
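A quick user-space sketch (illustrative only) of how the macro behaves; the __stringify() helpers are restated here and printf() stands in for printk():

/*
 * User-space sketch of the FILE_LINE macro; printf() stands in for printk().
 */
#include <stdio.h>

#define __stringify_1(x...)     #x
#define __stringify(x...)       __stringify_1(x)

#define FILE_LINE       __FILE__ ":" __stringify(__LINE__)

int main(void)
{
        /* Prints something like "demo.c:15" -- handy for quick debugging. */
        printf("%s\n", FILE_LINE);

        /* Being a string literal, it also concatenates with other literals. */
        printf(FILE_LINE ": value=%d\n", 42);
        return 0;
}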
Link: https://lkml.kernel.org/r/ebf12ac4-5a61-4b12-b8b0-1253eb371332@p183 Signed-off-by: Alexey Dobriyan Cc: Kees Cook Cc: Takashi Iwai Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/fortify-string.h | 2 +- include/linux/stringify.h | 2 ++ include/linux/timer.h | 3 +-- sound/pci/asihpi/hpidebug.h | 9 ++++----- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index da51a83b2829..442ee9170259 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -643,7 +643,7 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, __q_size_field, #op), \ #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \ __fortify_size, \ - "field \"" #p "\" at " __FILE__ ":" __stringify(__LINE__), \ + "field \"" #p "\" at " FILE_LINE, \ __p_size_field); \ __underlying_##op(p, q, __fortify_size); \ }) diff --git a/include/linux/stringify.h b/include/linux/stringify.h index 841cec8ed525..0e84cbe65270 100644 --- a/include/linux/stringify.h +++ b/include/linux/stringify.h @@ -9,4 +9,6 @@ #define __stringify_1(x...) #x #define __stringify(x...) __stringify_1(x) +#define FILE_LINE __FILE__ ":" __stringify(__LINE__) + #endif /* !__LINUX_STRINGIFY_H */ diff --git a/include/linux/timer.h b/include/linux/timer.h index 9162f275819a..26a545bb0153 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -77,8 +77,7 @@ struct timer_list { .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ .flags = (_flags), \ - __TIMER_LOCKDEP_MAP_INITIALIZER( \ - __FILE__ ":" __stringify(__LINE__)) \ + __TIMER_LOCKDEP_MAP_INITIALIZER(FILE_LINE) \ } #define DEFINE_TIMER(_name, _function) \ diff --git a/sound/pci/asihpi/hpidebug.h b/sound/pci/asihpi/hpidebug.h index c24ed69eb743..c6dfc229213d 100644 --- a/sound/pci/asihpi/hpidebug.h +++ b/sound/pci/asihpi/hpidebug.h @@ -29,16 +29,15 @@ enum { HPI_DEBUG_LEVEL_ERROR = 0, /* always log errors */ the start of each message, eg see linux kernel hpios.h */ #ifdef SOURCEFILE_NAME +#undef FILE_LINE #define FILE_LINE SOURCEFILE_NAME ":" __stringify(__LINE__) " " -#else -#define FILE_LINE __FILE__ ":" __stringify(__LINE__) " " #endif #define HPI_DEBUG_ASSERT(expression) \ do { \ if (!(expression)) { \ printk(KERN_ERR FILE_LINE \ - "ASSERT " __stringify(expression)); \ + " ASSERT " __stringify(expression)); \ } \ } while (0) @@ -46,7 +45,7 @@ enum { HPI_DEBUG_LEVEL_ERROR = 0, /* always log errors */ do { \ if (hpi_debug_level >= HPI_DEBUG_LEVEL_##level) { \ printk(HPI_DEBUG_FLAG_##level \ - FILE_LINE __VA_ARGS__); \ + FILE_LINE " " __VA_ARGS__); \ } \ } while (0) @@ -70,7 +69,7 @@ void hpi_debug_data(u16 *pdata, u32 len); do { \ if (hpi_debug_level >= HPI_DEBUG_LEVEL_##level) { \ hpi_debug_message(phm, HPI_DEBUG_FLAG_##level \ - FILE_LINE __stringify(level)); \ + FILE_LINE " " __stringify(level)); \ } \ } while (0) From 9bf2850c9170b52a6ab052085feced55effa78ef Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 16 Sep 2023 17:50:11 +0200 Subject: [PATCH 33/64] kstrtox: remove strtobool() The conversion from strtobool() to kstrtobool() is completed. So strtobool() can now be removed. 
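The migration this cleanup completes was a one-line rename at each call site; a kernel-style sketch for illustration, with a made-up function and flag name:

/* Kernel-style sketch, not a stand-alone program. */
#include <linux/kstrtox.h>
#include <linux/types.h>

static bool example_enabled;

static int example_param_set(const char *arg)
{
        /* Before the conversion: return strtobool(arg, &example_enabled); */
        return kstrtobool(arg, &example_enabled);
}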
Link: https://lkml.kernel.org/r/87e3cc2547df174cd5af1fadbf866be4ef9e8e45.1694878151.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Reviewed-by: Andy Shevchenko Signed-off-by: Andrew Morton --- include/linux/kstrtox.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/kstrtox.h b/include/linux/kstrtox.h index 529974e22ea7..7fcf29a4e0de 100644 --- a/include/linux/kstrtox.h +++ b/include/linux/kstrtox.h @@ -147,9 +147,4 @@ extern long simple_strtol(const char *,char **,unsigned int); extern unsigned long long simple_strtoull(const char *,char **,unsigned int); extern long long simple_strtoll(const char *,char **,unsigned int); -static inline int strtobool(const char *s, bool *res) -{ - return kstrtobool(s, res); -} - #endif /* _LINUX_KSTRTOX_H */ From a1cfa251f8d99b8a446c395a84ec9c537d3d8c45 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 22 Sep 2023 10:49:30 -0700 Subject: [PATCH 34/64] ocfs2: annotate struct ocfs2_replay_map with __counted_by Prepare for the coming implementation by GCC and Clang of the __counted_by attribute. Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). As found with Coccinelle[1], add __counted_by for struct ocfs2_replay_map. [1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci Link: https://lkml.kernel.org/r/20230922174925.work.293-kees@kernel.org Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tom Rix Signed-off-by: Andrew Morton --- fs/ocfs2/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index ce215565d061..604fea3a26ff 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -90,7 +90,7 @@ enum ocfs2_replay_state { struct ocfs2_replay_map { unsigned int rm_slots; enum ocfs2_replay_state rm_state; - unsigned char rm_replay_slots[]; + unsigned char rm_replay_slots[] __counted_by(rm_slots); }; static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) From a287116af12b33d8a03a281fce08af5acaac6ade Mon Sep 17 00:00:00 2001 From: Li kunyu Date: Tue, 26 Sep 2023 10:24:10 +0800 Subject: [PATCH 35/64] kernel/signal: remove unnecessary NULL values from ucounts ucounts is assigned first, so it does not need to initialize the assignment. Link: https://lkml.kernel.org/r/20230926022410.4280-1-kunyu@nfschina.com Signed-off-by: Li kunyu Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/signal.c b/kernel/signal.c index ccfc3ded5672..edaf39382d21 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -415,7 +415,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, int override_rlimit, const unsigned int sigqueue_flags) { struct sigqueue *q = NULL; - struct ucounts *ucounts = NULL; + struct ucounts *ucounts; long sigpending; /* From 80fcac55385ccb710d33a20dc1caaef29bd5a921 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Sep 2023 08:16:30 +0000 Subject: [PATCH 36/64] minmax: add umin(a, b) and umax(a, b) Patch series "minmax: Relax type checks in min() and max()", v4. The min() (etc) functions in minmax.h require that the arguments have exactly the same types. 
However, when the type check fails, rather than look at the types and fix the type of a variable/constant, everyone seems to jump on min_t(). In reality min_t() ought to be rare - used when something unusual is being done, not as the norm. The original min() (added in 2.4.9) replaced several inline functions and included the type - so it matched the implicit casting of a function call. This was renamed min_t() in 2.4.10 and the current min() added. There is no actual indication that the conversion of negative values to large unsigned values has ever been an actual problem.

A quick grep shows 5734 min() and 4597 min_t(). Having the casts on almost half of the calls shows that something is clearly wrong. If the wrong type is picked (and it is far too easy to pick the type of the result instead of the larger input) then significant bits can get discarded. Pretty much the worst example is in the derived clamp_val(), consider: unsigned char x = 200u; y = clamp_val(x, 10u, 300u);

I also suspect that many of the min_t(u16, ...) are actually wrong. For example copy_data() in printk_ringbuffer.c contains: data_size = min_t(u16, buf_size, len); Here buf_size is 'unsigned int' and len 'u16'; pass a 64k buffer (can you prove that doesn't happen?) and no data is returned. Apparently it did - and has since been fixed. The only reason most of the min_t() are 'fine' is that pretty much all the values in the kernel are between 0 and INT_MAX.

Patch 1 adds umin(), which uses integer promotions to convert both arguments to 'unsigned long long'. It can be used to compare a signed type that is known to contain a non-negative value with an unsigned type. The compiler typically optimises it all away. It is added first so that it can be referred to in patch 2.

Patch 2 replaces the 'same type' check with a 'same signedness' one. This makes min(unsigned_int_var, sizeof()) be ok. The error message is also improved and will contain the expanded form of both arguments (useful for seeing how constants are defined).

Patch 3 just fixes some whitespace.

Patch 4 allows comparisons of 'unsigned char' and 'unsigned short' to signed types. The integer promotion rules convert them both to 'signed int' prior to the comparison, so they can never cause a negative value to be converted to a large positive one.

Patch 5 (rewritten for v4) allows comparisons of unsigned values against non-negative constant integer expressions. This makes min(unsigned_int_var, 4) be ok. The only common case that is still errored is the comparison of signed values against unsigned constant integer expressions below __INT_MAX__, typically min(int_val, sizeof (foo)); the real fix for this is casting the constant: min(int_val, (int)sizeof (foo)).

With all the patches applied pretty much all the min_t() could be replaced by min(), and most of the rest by umin(). However they all need careful inspection due to code like: sz = min_t(unsigned char, sz - 1, LIM - 1) + 1; which converts 0 to LIM.

This patch (of 6):

umin() and umax() can be used when min()/max() errors out on a signed v unsigned compare and the signed value is known to be non-negative. Unlike min_t(some_unsigned_type, a, b), umin() will never mask off high bits if an inappropriate type is selected.

The '+ 0u + 0ul + 0ull' may look strange. The '+ 0u' is needed for 'signed int' on 64bit systems. The '+ 0ul' is needed for 'signed long' on 32bit systems. The '+ 0ull' is needed for 'signed long long'.
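Both failure modes called out above - the signed v unsigned wrap-around and the silent truncation from a too-narrow min_t() type - can be reproduced in a small stand-alone program. The umin() below is a simplified restatement of the macro being added (no type checking, arguments evaluated more than once), just enough to show the promotion trick:

/* Stand-alone demo of the hazards described above. */
#include <stdio.h>

#define umin(x, y)                                              \
        ((x) + 0u + 0ul + 0ull < (y) + 0u + 0ul + 0ull ?        \
         (x) + 0u + 0ul + 0ull : (y) + 0u + 0ul + 0ull)

int main(void)
{
        unsigned int buf_size = 64 * 1024;      /* 64k buffer */
        unsigned short len = 100;

        /* min_t(u16, buf_size, len): the u16 cast truncates 65536 to 0. */
        unsigned short bad = (unsigned short)buf_size < (unsigned short)len ?
                             (unsigned short)buf_size : (unsigned short)len;

        int signed_len = 100;                   /* known non-negative */
        unsigned int limit = 200;

        /* Plain comparison: -1 converts to UINT_MAX, so -1 > 200u here. */
        printf("-1 > 200u is %d\n", -1 > 200u ? 1 : 0);

        printf("min_t(u16, 64k, 100)  -> %u (data silently lost)\n", bad);
        printf("umin(64k, 100)        -> %llu\n", umin(buf_size, len));
        printf("umin(signed 100, 200) -> %llu\n", umin(signed_len, limit));
        return 0;
}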
Link: https://lkml.kernel.org/r/b97faef60ad24922b530241c5d7c933c@AcuMS.aculab.com Link: https://lkml.kernel.org/r/41d93ca827a248698ec64bf57e0c05a5@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Christoph Hellwig Cc: Jason A. Donenfeld Cc: Linus Torvalds Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/minmax.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index ca69abd6151e..2a197f54fe05 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -76,6 +76,23 @@ */ #define max(x, y) __careful_cmp(x, y, >) +/** + * umin - return minimum of two non-negative values + * Signed types are zero extended to match a larger unsigned type. + * @x: first value + * @y: second value + */ +#define umin(x, y) \ + __careful_cmp((x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull, <) + +/** + * umax - return maximum of two non-negative values + * @x: first value + * @y: second value + */ +#define umax(x, y) \ + __careful_cmp((x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull, >) + /** * min3 - return minimum of three values * @x: first value From d03eba99f5bf7cbc6e2fdde3b6fa36954ad58e09 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Sep 2023 08:17:15 +0000 Subject: [PATCH 37/64] minmax: allow min()/max()/clamp() if the arguments have the same signedness. The type-check in min()/max() is there to stop unexpected results if a negative value gets converted to a large unsigned value. However it also rejects 'unsigned int' v 'unsigned long' compares which are common and never problematc. Replace the 'same type' check with a 'same signedness' check. The new test isn't itself a compile time error, so use static_assert() to report the error and give a meaningful error message. Due to the way builtin_choose_expr() works detecting the error in the 'non-constant' side (where static_assert() can be used) also detects errors when the arguments are constant. Link: https://lkml.kernel.org/r/fe7e6c542e094bfca655abcd323c1c98@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Christoph Hellwig Cc: Jason A. Donenfeld Cc: Linus Torvalds Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/minmax.h | 60 ++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 2a197f54fe05..8718fd71a793 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -12,9 +12,8 @@ * * - avoid multiple evaluations of the arguments (so side-effects like * "x++" happen only once) when non-constant. - * - perform strict type-checking (to generate warnings instead of - * nasty runtime surprises). See the "unnecessary" pointer comparison - * in __typecheck(). + * - perform signed v unsigned type-checking (to generate compile + * errors instead of nasty runtime surprises). * - retain result as a constant expressions when called with only * constant expressions (to avoid tripping VLA warnings in stack * allocation usage). 
@@ -22,23 +21,30 @@ #define __typecheck(x, y) \ (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) -#define __no_side_effects(x, y) \ - (__is_constexpr(x) && __is_constexpr(y)) +/* is_signed_type() isn't a constexpr for pointer types */ +#define __is_signed(x) \ + __builtin_choose_expr(__is_constexpr(is_signed_type(typeof(x))), \ + is_signed_type(typeof(x)), 0) -#define __safe_cmp(x, y) \ - (__typecheck(x, y) && __no_side_effects(x, y)) +#define __types_ok(x, y) \ + (__is_signed(x) == __is_signed(y)) -#define __cmp(x, y, op) ((x) op (y) ? (x) : (y)) +#define __cmp_op_min < +#define __cmp_op_max > -#define __cmp_once(x, y, unique_x, unique_y, op) ({ \ +#define __cmp(op, x, y) ((x) __cmp_op_##op (y) ? (x) : (y)) + +#define __cmp_once(op, x, y, unique_x, unique_y) ({ \ typeof(x) unique_x = (x); \ typeof(y) unique_y = (y); \ - __cmp(unique_x, unique_y, op); }) + static_assert(__types_ok(x, y), \ + #op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \ + __cmp(op, unique_x, unique_y); }) -#define __careful_cmp(x, y, op) \ - __builtin_choose_expr(__safe_cmp(x, y), \ - __cmp(x, y, op), \ - __cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op)) +#define __careful_cmp(op, x, y) \ + __builtin_choose_expr(__is_constexpr((x) - (y)), \ + __cmp(op, x, y), \ + __cmp_once(op, x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y))) #define __clamp(val, lo, hi) \ ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) @@ -47,17 +53,15 @@ typeof(val) unique_val = (val); \ typeof(lo) unique_lo = (lo); \ typeof(hi) unique_hi = (hi); \ + static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ + (lo) <= (hi), true), \ + "clamp() low limit " #lo " greater than high limit " #hi); \ + static_assert(__types_ok(val, lo), "clamp() 'lo' signedness error"); \ + static_assert(__types_ok(val, hi), "clamp() 'hi' signedness error"); \ __clamp(unique_val, unique_lo, unique_hi); }) -#define __clamp_input_check(lo, hi) \ - (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ - __is_constexpr((lo) > (hi)), (lo) > (hi), false))) - #define __careful_clamp(val, lo, hi) ({ \ - __clamp_input_check(lo, hi) + \ - __builtin_choose_expr(__typecheck(val, lo) && __typecheck(val, hi) && \ - __typecheck(hi, lo) && __is_constexpr(val) && \ - __is_constexpr(lo) && __is_constexpr(hi), \ + __builtin_choose_expr(__is_constexpr((val) - (lo) + (hi)), \ __clamp(val, lo, hi), \ __clamp_once(val, lo, hi, __UNIQUE_ID(__val), \ __UNIQUE_ID(__lo), __UNIQUE_ID(__hi))); }) @@ -67,14 +71,14 @@ * @x: first value * @y: second value */ -#define min(x, y) __careful_cmp(x, y, <) +#define min(x, y) __careful_cmp(min, x, y) /** * max - return maximum of two values of the same or compatible types * @x: first value * @y: second value */ -#define max(x, y) __careful_cmp(x, y, >) +#define max(x, y) __careful_cmp(max, x, y) /** * umin - return minimum of two non-negative values @@ -83,7 +87,7 @@ * @y: second value */ #define umin(x, y) \ - __careful_cmp((x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull, <) + __careful_cmp(min, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull) /** * umax - return maximum of two non-negative values @@ -91,7 +95,7 @@ * @y: second value */ #define umax(x, y) \ - __careful_cmp((x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull, >) + __careful_cmp(max, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull) /** * min3 - return minimum of three values @@ -143,7 +147,7 @@ * @x: first value * @y: second value */ -#define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <) +#define min_t(type, x, y) __careful_cmp(min, (type)(x), 
(type)(y)) /** * max_t - return maximum of two values, using the specified type @@ -151,7 +155,7 @@ * @x: first value * @y: second value */ -#define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >) +#define max_t(type, x, y) __careful_cmp(max, (type)(x), (type)(y)) /* * Do not check the array parameter using __must_be_array(). From f4b84b2ff851f01d0fac619eadef47eb41648534 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Sep 2023 08:17:57 +0000 Subject: [PATCH 38/64] minmax: fix indentation of __cmp_once() and __clamp_once() Remove the extra indentation and align continuation markers. Link: https://lkml.kernel.org/r/bed41317a05c498ea0209eafbcab45a5@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Christoph Hellwig Cc: Jason A. Donenfeld Cc: Linus Torvalds Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/minmax.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 8718fd71a793..c0e738eacefa 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -35,11 +35,11 @@ #define __cmp(op, x, y) ((x) __cmp_op_##op (y) ? (x) : (y)) #define __cmp_once(op, x, y, unique_x, unique_y) ({ \ - typeof(x) unique_x = (x); \ - typeof(y) unique_y = (y); \ - static_assert(__types_ok(x, y), \ - #op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \ - __cmp(op, unique_x, unique_y); }) + typeof(x) unique_x = (x); \ + typeof(y) unique_y = (y); \ + static_assert(__types_ok(x, y), \ + #op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \ + __cmp(op, unique_x, unique_y); }) #define __careful_cmp(op, x, y) \ __builtin_choose_expr(__is_constexpr((x) - (y)), \ @@ -49,16 +49,16 @@ #define __clamp(val, lo, hi) \ ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) -#define __clamp_once(val, lo, hi, unique_val, unique_lo, unique_hi) ({ \ - typeof(val) unique_val = (val); \ - typeof(lo) unique_lo = (lo); \ - typeof(hi) unique_hi = (hi); \ - static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ - (lo) <= (hi), true), \ - "clamp() low limit " #lo " greater than high limit " #hi); \ - static_assert(__types_ok(val, lo), "clamp() 'lo' signedness error"); \ - static_assert(__types_ok(val, hi), "clamp() 'hi' signedness error"); \ - __clamp(unique_val, unique_lo, unique_hi); }) +#define __clamp_once(val, lo, hi, unique_val, unique_lo, unique_hi) ({ \ + typeof(val) unique_val = (val); \ + typeof(lo) unique_lo = (lo); \ + typeof(hi) unique_hi = (hi); \ + static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ + (lo) <= (hi), true), \ + "clamp() low limit " #lo " greater than high limit " #hi); \ + static_assert(__types_ok(val, lo), "clamp() 'lo' signedness error"); \ + static_assert(__types_ok(val, hi), "clamp() 'hi' signedness error"); \ + __clamp(unique_val, unique_lo, unique_hi); }) #define __careful_clamp(val, lo, hi) ({ \ __builtin_choose_expr(__is_constexpr((val) - (lo) + (hi)), \ From 4ead534fba42fc4fd41163297528d2aa731cd121 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Sep 2023 08:18:40 +0000 Subject: [PATCH 39/64] minmax: allow comparisons of 'int' against 'unsigned char/short' Since 'unsigned char/short' get promoted to 'signed int' it is safe to compare them against an 'int' value. 
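A user-space sketch of the promotion rule this relies on (illustrative only; the second comparison is exactly the surprise the type check exists to catch, and most compilers will flag it with -Wsign-compare):

    #include <stdio.h>

    int main(void)
    {
            unsigned short us = 10;
            unsigned int ui = 10;
            int neg = -1;

            /* 'us' is promoted to (signed) int, so this compares 10 > -1. */
            printf("us > neg: %d\n", us > neg);      /* prints 1 */

            /* 'neg' is converted to UINT_MAX, so this compares 10 > 4294967295. */
            printf("ui > neg: %d\n", ui > neg);      /* prints 0 */
            return 0;
    }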
Link: https://lkml.kernel.org/r/8732ef5f809c47c28a7be47c938b28d4@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Christoph Hellwig Cc: Jason A. Donenfeld Cc: Linus Torvalds Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/minmax.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index c0e738eacefa..842c1db62ffe 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -26,8 +26,9 @@ __builtin_choose_expr(__is_constexpr(is_signed_type(typeof(x))), \ is_signed_type(typeof(x)), 0) -#define __types_ok(x, y) \ - (__is_signed(x) == __is_signed(y)) +#define __types_ok(x, y) \ + (__is_signed(x) == __is_signed(y) || \ + __is_signed((x) + 0) == __is_signed((y) + 0)) #define __cmp_op_min < #define __cmp_op_max > From 867046cc7027703f60a46339ffde91a1970f2901 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Sep 2023 08:19:25 +0000 Subject: [PATCH 40/64] minmax: relax check to allow comparison between unsigned arguments and signed constants Allow (for example) min(unsigned_var, 20). The opposite min(signed_var, 20u) is still errored. Since a comparison between signed and unsigned never makes the unsigned value negative it is only necessary to adjust the __types_ok() test. Link: https://lkml.kernel.org/r/633b64e2f39e46bb8234809c5595b8c7@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Christoph Hellwig Cc: Jason A. Donenfeld Cc: Linus Torvalds Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/minmax.h | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 842c1db62ffe..2ec559284a9f 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -10,13 +10,18 @@ /* * min()/max()/clamp() macros must accomplish three things: * - * - avoid multiple evaluations of the arguments (so side-effects like + * - Avoid multiple evaluations of the arguments (so side-effects like * "x++" happen only once) when non-constant. - * - perform signed v unsigned type-checking (to generate compile - * errors instead of nasty runtime surprises). - * - retain result as a constant expressions when called with only + * - Retain result as a constant expressions when called with only * constant expressions (to avoid tripping VLA warnings in stack * allocation usage). + * - Perform signed v unsigned type-checking (to generate compile + * errors instead of nasty runtime surprises). + * - Unsigned char/short are always promoted to signed int and can be + * compared against signed or unsigned arguments. + * - Unsigned arguments can be compared against non-negative signed constants. + * - Comparison of a signed argument against an unsigned constant fails + * even if the constant is below __INT_MAX__ and could be cast to int. 
*/ #define __typecheck(x, y) \ (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) @@ -26,9 +31,14 @@ __builtin_choose_expr(__is_constexpr(is_signed_type(typeof(x))), \ is_signed_type(typeof(x)), 0) -#define __types_ok(x, y) \ - (__is_signed(x) == __is_signed(y) || \ - __is_signed((x) + 0) == __is_signed((y) + 0)) +/* True for a non-negative signed int constant */ +#define __is_noneg_int(x) \ + (__builtin_choose_expr(__is_constexpr(x) && __is_signed(x), x, -1) >= 0) + +#define __types_ok(x, y) \ + (__is_signed(x) == __is_signed(y) || \ + __is_signed((x) + 0) == __is_signed((y) + 0) || \ + __is_noneg_int(x) || __is_noneg_int(y)) #define __cmp_op_min < #define __cmp_op_max > From 860a2e7fa4a186a78be904879f752858c96328ed Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 29 Sep 2023 19:30:18 +0300 Subject: [PATCH 41/64] proc: use initializer for clearing some buffers Save LOC by using dark magic of initialisation instead of memset(). Those buffer aren't passed to userspace directly so padding is not an issue. Link: https://lkml.kernel.org/r/3821d3a2-6e10-4629-b0d5-9519d828ab72@p183 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- fs/proc/base.c | 16 ++++++---------- fs/proc/task_mmu.c | 11 +++-------- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index c0e971cc6d41..601329f9629a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1153,11 +1153,10 @@ err_unlock: static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; int oom_adj; int err; - memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { @@ -1213,11 +1212,10 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; int oom_score_adj; int err; - memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { @@ -1358,13 +1356,13 @@ static ssize_t proc_fault_inject_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; int make_it_fail; int rv; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) @@ -1509,11 +1507,10 @@ sched_autogroup_write(struct file *file, const char __user *buf, { struct inode *inode = file_inode(file); struct task_struct *p; - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; int nice; int err; - memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) @@ -1666,10 +1663,9 @@ static ssize_t comm_write(struct file *file, const char __user *buf, { struct inode *inode = file_inode(file); struct task_struct *p; - char buffer[TASK_COMM_LEN]; + char buffer[TASK_COMM_LEN] = {}; const size_t maxlen = sizeof(buffer) - 1; - memset(buffer, 0, sizeof(buffer)); if (copy_from_user(buffer, buf, count > maxlen ? 
maxlen : count)) return -EFAULT; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3dd5be96691b..d4d55d5bae51 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -849,9 +849,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, static int show_smap(struct seq_file *m, void *v) { struct vm_area_struct *vma = v; - struct mem_size_stats mss; - - memset(&mss, 0, sizeof(mss)); + struct mem_size_stats mss = {}; smap_gather_stats(vma, &mss, 0); @@ -877,7 +875,7 @@ static int show_smap(struct seq_file *m, void *v) static int show_smaps_rollup(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; - struct mem_size_stats mss; + struct mem_size_stats mss = {}; struct mm_struct *mm = priv->mm; struct vm_area_struct *vma; unsigned long vma_start = 0, last_vma_end = 0; @@ -893,8 +891,6 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_task; } - memset(&mss, 0, sizeof(mss)); - ret = mmap_read_lock_killable(mm); if (ret) goto out_put_mm; @@ -1246,14 +1242,13 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; int itype; int rv; - memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) From ead5a727739fa63b94ccd281d848a503032444ee Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 29 Sep 2023 19:31:41 +0300 Subject: [PATCH 42/64] proc: save LOC by using while loop Use while loop instead of infinite loop with "break;". Also move some variable to the inner scope where they belong. Link: https://lkml.kernel.org/r/82c8f8e7-8ded-46ca-8857-e60b991d6205@p183 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- fs/proc/inode.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 532dc9d240f7..5933c78af6de 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -110,18 +110,15 @@ void __init proc_init_kmemcache(void) void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) { - struct inode *inode; - struct proc_inode *ei; struct hlist_node *node; struct super_block *old_sb = NULL; rcu_read_lock(); - for (;;) { + while ((node = hlist_first_rcu(inodes))) { + struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes); struct super_block *sb; - node = hlist_first_rcu(inodes); - if (!node) - break; - ei = hlist_entry(node, struct proc_inode, sibling_inodes); + struct inode *inode; + spin_lock(lock); hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(lock); From 71ca5ee18708c1f9f086e20ac0a657009bcfe43a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 5 Oct 2023 14:35:17 -0700 Subject: [PATCH 43/64] get_maintainer: add --keywords-in-file option There were some recent attempts [1] [2] to make the K: field less noisy and its behavior more obvious. Ultimately, a shift in the default behavior and an associated command line flag is the best choice. Currently, K: will match keywords found in both patches and files. Matching content from entire files is (while documented) not obvious behavior and is usually not wanted by maintainers. Now only patch content will be matched against unless --keywords-in-file is also provided as an argument to get_maintainer. 
Add the actual keyword matched to the role or rolestats as well. For instance given the diff below that removes clang: : diff --git a/drivers/hid/bpf/entrypoints/README b/drivers/hid/bpf/entrypoints/README : index 147e0d41509f..f88eb19e8ef2 100644 : --- a/drivers/hid/bpf/entrypoints/README : +++ b/drivers/hid/bpf/entrypoints/README : @@ -1,4 +1,4 @@ : WARNING: : If you change "entrypoints.bpf.c" do "make -j" in this directory to rebuild "entrypoints.skel.h". : -Make sure to have clang 10 installed. : +Make sure to have 10 installed. : See Documentation/bpf/bpf_devel_QA.rst The new role/rolestats output includes ":Keyword:\b(?i:clang|llvm)\b" $ git diff drivers/hid/bpf/entrypoints/README | .scripts/get_maintainer.pl Jiri Kosina (maintainer:HID CORE LAYER,commit_signer:1/1=100%) Benjamin Tissoires (maintainer:HID CORE LAYER,commit_signer:1/1=100%,authored:1/1=100%,added_lines:4/4=100%) Nathan Chancellor (supporter:CLANG/LLVM BUILD SUPPORT:Keyword:\b(?i:clang|llvm)\b) Nick Desaulniers (supporter:CLANG/LLVM BUILD SUPPORT:Keyword:\b(?i:clang|llvm)\b) Tom Rix (reviewer:CLANG/LLVM BUILD SUPPORT:Keyword:\b(?i:clang|llvm)\b) Greg Kroah-Hartman (commit_signer:1/1=100%) linux-input@vger.kernel.org (open list:HID CORE LAYER) linux-kernel@vger.kernel.org (open list) llvm@lists.linux.dev (open list:CLANG/LLVM BUILD SUPPORT:Keyword:\b(?i:clang|llvm)\b) Link: https://lore.kernel.org/r/20231004-get_maintainer_change_k-v1-1-ac7ced18306a@google.com Link: https://lore.kernel.org/all/20230928-get_maintainer_add_d-v2-0-8acb3f394571@google.com Link: https://lore.kernel.org/all/3dca40b677dd2fef979a5a581a2db91df2c21801.camel@perches.com Original-patch-by: Justin Stitt Link: https://lkml.kernel.org/r/01fe46f0c58aa8baf92156ae2bdccfb2bf0cb48e.camel@perches.com Signed-off-by: Joe Perches Tested-by: Justin Stitt Cc: Kees Cook Cc: Nick Desaulniers Signed-off-by: Andrew Morton --- scripts/get_maintainer.pl | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index ab123b498fd9..16d8ac6005b6 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -57,6 +57,7 @@ my $subsystem = 0; my $status = 0; my $letters = ""; my $keywords = 1; +my $keywords_in_file = 0; my $sections = 0; my $email_file_emails = 0; my $from_filename = 0; @@ -272,6 +273,7 @@ if (!GetOptions( 'letters=s' => \$letters, 'pattern-depth=i' => \$pattern_depth, 'k|keywords!' => \$keywords, + 'kf|keywords-in-file!' => \$keywords_in_file, 'sections!' => \$sections, 'fe|file-emails!' 
=> \$email_file_emails, 'f|file' => \$from_filename, @@ -318,6 +320,7 @@ if ($sections || $letters ne "") { $subsystem = 0; $web = 0; $keywords = 0; + $keywords_in_file = 0; $interactive = 0; } else { my $selections = $email + $scm + $status + $subsystem + $web; @@ -548,16 +551,14 @@ foreach my $file (@ARGV) { $file =~ s/^\Q${cur_path}\E//; #strip any absolute path $file =~ s/^\Q${lk_path}\E//; #or the path to the lk tree push(@files, $file); - if ($file ne "MAINTAINERS" && -f $file && $keywords) { + if ($file ne "MAINTAINERS" && -f $file && $keywords && $keywords_in_file) { open(my $f, '<', $file) or die "$P: Can't open $file: $!\n"; my $text = do { local($/) ; <$f> }; close($f); - if ($keywords) { - foreach my $line (keys %keyword_hash) { - if ($text =~ m/$keyword_hash{$line}/x) { - push(@keyword_tvi, $line); - } + foreach my $line (keys %keyword_hash) { + if ($text =~ m/$keyword_hash{$line}/x) { + push(@keyword_tvi, $line); } } } @@ -919,7 +920,7 @@ sub get_maintainers { } foreach my $line (sort {$hash{$b} <=> $hash{$a}} keys %hash) { - add_categories($line); + add_categories($line, ""); if ($sections) { my $i; my $start = find_starting_index($line); @@ -947,7 +948,7 @@ sub get_maintainers { if ($keywords) { @keyword_tvi = sort_and_uniq(@keyword_tvi); foreach my $line (@keyword_tvi) { - add_categories($line); + add_categories($line, ":Keyword:$keyword_hash{$line}"); } } @@ -1076,6 +1077,7 @@ Output type options: Other options: --pattern-depth => Number of pattern directory traversals (default: 0 (all)) --keywords => scan patch for keywords (default: $keywords) + --keywords-in-file => scan file for keywords (default: $keywords_in_file) --sections => print all of the subsystem sections with pattern matches --letters => print all matching 'letter' types from all matching sections --mailmap => use .mailmap file (default: $email_use_mailmap) @@ -1086,7 +1088,7 @@ Other options: Default options: [--email --tree --nogit --git-fallback --m --r --n --l --multiline - --pattern-depth=0 --remove-duplicates --rolestats] + --pattern-depth=0 --remove-duplicates --rolestats --keywords] Notes: Using "-f directory" may give unexpected results: @@ -1312,7 +1314,7 @@ sub get_list_role { } sub add_categories { - my ($index) = @_; + my ($index, $suffix) = @_; my $i; my $start = find_starting_index($index); @@ -1342,7 +1344,7 @@ sub add_categories { if (!$hash_list_to{lc($list_address)}) { $hash_list_to{lc($list_address)} = 1; push(@list_to, [$list_address, - "subscriber list${list_role}"]); + "subscriber list${list_role}" . $suffix]); } } } else { @@ -1352,12 +1354,12 @@ sub add_categories { if ($email_moderated_list) { $hash_list_to{lc($list_address)} = 1; push(@list_to, [$list_address, - "moderated list${list_role}"]); + "moderated list${list_role}" . $suffix]); } } else { $hash_list_to{lc($list_address)} = 1; push(@list_to, [$list_address, - "open list${list_role}"]); + "open list${list_role}" . $suffix]); } } } @@ -1365,19 +1367,19 @@ sub add_categories { } elsif ($ptype eq "M") { if ($email_maintainer) { my $role = get_maintainer_role($i); - push_email_addresses($pvalue, $role); + push_email_addresses($pvalue, $role . $suffix); } } elsif ($ptype eq "R") { if ($email_reviewer) { my $subsystem = get_subsystem_name($i); - push_email_addresses($pvalue, "reviewer:$subsystem"); + push_email_addresses($pvalue, "reviewer:$subsystem" . $suffix); } } elsif ($ptype eq "T") { - push(@scm, $pvalue); + push(@scm, $pvalue . $suffix); } elsif ($ptype eq "W") { - push(@web, $pvalue); + push(@web, $pvalue . 
$suffix); } elsif ($ptype eq "S") { - push(@status, $pvalue); + push(@status, $pvalue . $suffix); } } } From fbd126f5a658b92c7f6af986a6d89cf5e5693268 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 22 Sep 2023 10:52:20 -0700 Subject: [PATCH 44/64] gcov: annotate struct gcov_iterator with __counted_by Prepare for the coming implementation by GCC and Clang of the __counted_by attribute. Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). As found with Coccinelle[1], add __counted_by for struct gcov_iterator. [1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci Link: https://lkml.kernel.org/r/20230922175220.work.327-kees@kernel.org Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Reviewed-by: Peter Oberparleiter Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tom Rix Signed-off-by: Andrew Morton --- kernel/gcov/fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 5c3086cad8f9..01520689b57c 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -99,7 +99,7 @@ struct gcov_iterator { struct gcov_info *info; size_t size; loff_t pos; - char buffer[]; + char buffer[] __counted_by(size); }; /** From 598f0ac1500d7145c696de4d38797b1dd2c651de Mon Sep 17 00:00:00 2001 From: David Laight Date: Thu, 5 Oct 2023 11:39:54 +0000 Subject: [PATCH 45/64] compiler.h: move __is_constexpr() to compiler.h Prior to f747e6667ebb2 __is_constexpr() was in its only user minmax.h. That commit moved it to const.h - but that file just defines ULL(x) and UL(x) so that constants can be defined for .S and .c files. So apart from the word 'const' it wasn't really a good location. Instead move the definition to compiler.h just before the similar is_signed_type() and is_unsigned_type(). This may not be a good long-term home, but the three definitions belong together. Link: https://lkml.kernel.org/r/2a6680bbe2e84459816a113730426782@AcuMS.aculab.com Signed-off-by: David Laight Reviewed-by: Kees Cook Cc: Arnd Bergmann Cc: Bart Van Assche Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Rasmus Villemoes Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/compiler.h | 8 ++++++++ include/linux/const.h | 8 -------- tools/include/linux/compiler.h | 8 ++++++++ tools/include/linux/const.h | 8 -------- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 174099fdc485..bb1339c7057b 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -227,6 +227,14 @@ static inline void *offset_to_ptr(const int *off) /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +/* + * This returns a constant expression while determining if an argument is + * a constant expression, most importantly without evaluating the argument. + * Glory to Martin Uecker + */ +#define __is_constexpr(x) \ + (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) + /* * Whether 'type' is a signed type or an unsigned type. Supports scalar types, * bool and also pointer types. 
diff --git a/include/linux/const.h b/include/linux/const.h index 435ddd72d2c4..81b8aae5a855 100644 --- a/include/linux/const.h +++ b/include/linux/const.h @@ -3,12 +3,4 @@ #include -/* - * This returns a constant expression while determining if an argument is - * a constant expression, most importantly without evaluating the argument. - * Glory to Martin Uecker - */ -#define __is_constexpr(x) \ - (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) - #endif /* _LINUX_CONST_H */ diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h index 1684216e826a..7b65566f3e42 100644 --- a/tools/include/linux/compiler.h +++ b/tools/include/linux/compiler.h @@ -63,6 +63,14 @@ # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) #endif +/* + * This returns a constant expression while determining if an argument is + * a constant expression, most importantly without evaluating the argument. + * Glory to Martin Uecker + */ +#define __is_constexpr(x) \ + (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) + #ifdef __ANDROID__ /* * FIXME: Big hammer to get rid of tons of: diff --git a/tools/include/linux/const.h b/tools/include/linux/const.h index 435ddd72d2c4..81b8aae5a855 100644 --- a/tools/include/linux/const.h +++ b/tools/include/linux/const.h @@ -3,12 +3,4 @@ #include -/* - * This returns a constant expression while determining if an argument is - * a constant expression, most importantly without evaluating the argument. - * Glory to Martin Uecker - */ -#define __is_constexpr(x) \ - (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) - #endif /* _LINUX_CONST_H */ From 6e79b375adb38219099d7e3ccc973a7808108a3e Mon Sep 17 00:00:00 2001 From: Swarup Laxman Kotiaklapudi Date: Mon, 9 Oct 2023 14:07:14 +0300 Subject: [PATCH 46/64] proc: test /proc/${pid}/statm My original comment lied, output can be "0 A A B 0 0 0\n" (see comment in the code). I don't quite understand why get_mm_counter(mm, MM_FILEPAGES) + get_mm_counter(mm, MM_SHMEMPAGES) can stay positive but get_mm_counter(mm, MM_ANONPAGES) is always 0 after everything is unmapped but that's just me. [adobriyan@gmail.com: more or less rewritten] Link: https://lkml.kernel.org/r/0721ca69-7bb4-40aa-8d01-0c5f91e5f363@p183 Signed-off-by: Swarup Laxman Kotiaklapudi Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- tools/testing/selftests/proc/proc-empty-vm.c | 97 +++++++++++++++++++- 1 file changed, 92 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/proc/proc-empty-vm.c b/tools/testing/selftests/proc/proc-empty-vm.c index ee71ce52cb6a..171d46a1ac27 100644 --- a/tools/testing/selftests/proc/proc-empty-vm.c +++ b/tools/testing/selftests/proc/proc-empty-vm.c @@ -303,6 +303,95 @@ static int test_proc_pid_smaps_rollup(pid_t pid) } } +static const char *parse_u64(const char *p, const char *const end, uint64_t *rv) +{ + *rv = 0; + for (; p != end; p += 1) { + if ('0' <= *p && *p <= '9') { + assert(!__builtin_mul_overflow(*rv, 10, rv)); + assert(!__builtin_add_overflow(*rv, *p - '0', rv)); + } else { + break; + } + } + assert(p != end); + return p; +} + +/* + * There seems to be 2 types of valid output: + * "0 A A B 0 0 0\n" for dynamic exeuctables, + * "0 0 0 B 0 0 0\n" for static executables. 
+ */ +static int test_proc_pid_statm(pid_t pid) +{ + char buf[4096]; + snprintf(buf, sizeof(buf), "/proc/%u/statm", pid); + int fd = open(buf, O_RDONLY); + if (fd == -1) { + perror("open /proc/${pid}/statm"); + return EXIT_FAILURE; + } + + ssize_t rv = read(fd, buf, sizeof(buf)); + close(fd); + + assert(rv >= 0); + assert(rv <= sizeof(buf)); + if (0) { + write(1, buf, rv); + } + + const char *p = buf; + const char *const end = p + rv; + + /* size */ + assert(p != end && *p++ == '0'); + assert(p != end && *p++ == ' '); + + uint64_t resident; + p = parse_u64(p, end, &resident); + assert(p != end && *p++ == ' '); + + uint64_t shared; + p = parse_u64(p, end, &shared); + assert(p != end && *p++ == ' '); + + uint64_t text; + p = parse_u64(p, end, &text); + assert(p != end && *p++ == ' '); + + assert(p != end && *p++ == '0'); + assert(p != end && *p++ == ' '); + + /* data */ + assert(p != end && *p++ == '0'); + assert(p != end && *p++ == ' '); + + assert(p != end && *p++ == '0'); + assert(p != end && *p++ == '\n'); + + assert(p == end); + + /* + * "text" is "mm->end_code - mm->start_code" at execve(2) time. + * munmap() doesn't change it. It can be anything (just link + * statically). It can't be 0 because executing to this point + * implies at least 1 page of code. + */ + assert(text > 0); + + /* + * These two are always equal. Always 0 for statically linked + * executables and sometimes 0 for dynamically linked executables. + * There is no way to tell one from another without parsing ELF + * which is too much for this test. + */ + assert(resident == shared); + + return EXIT_SUCCESS; +} + int main(void) { int rv = EXIT_SUCCESS; @@ -389,11 +478,9 @@ int main(void) if (rv == EXIT_SUCCESS) { rv = test_proc_pid_smaps_rollup(pid); } - /* - * TODO test /proc/${pid}/statm, task_statm() - * ->start_code, ->end_code aren't updated by munmap(). - * Output can be "0 0 0 2 0 0 0\n" where "2" can be anything. - */ + if (rv == EXIT_SUCCESS) { + rv = test_proc_pid_statm(pid); + } /* Cut the rope. */ int wstatus; From 1b13a7030504da9d379270675166342e7039817c Mon Sep 17 00:00:00 2001 From: Artem Chernyshev Date: Mon, 9 Oct 2023 17:11:11 +0300 Subject: [PATCH 47/64] fs: ocfs2: check status values Test return values before overwriting. Found by Linux Verification Center (linuxtesting.org) with SVACE. 
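The shape of the bug, as a small standalone sketch (the helper names are made up for illustration and are not the ocfs2 functions touched below):

    #include <stdio.h>

    static int update_entry(void)   { return -5; }  /* pretend this fails    */
    static int journal_access(void) { return 0;  }  /* pretend this succeeds */

    int main(void)
    {
            int status;

            /* Buggy shape: the first return value is overwritten before it is
             * ever tested, so the failure of update_entry() is silently lost. */
            status = update_entry();
            status = journal_access();
            printf("unchecked path ends with status = %d\n", status);

            /* Fixed shape: test each status before it can be clobbered. */
            status = update_entry();
            if (status < 0) {
                    printf("checked path catches error %d\n", status);
                    return 1;
            }
            status = journal_access();
            return status;
    }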
Link: https://lkml.kernel.org/r/20231009141111.149858-1-artem.chernyshev@red-soft.ru Signed-off-by: Artem Chernyshev Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 8 ++++++++ fs/ocfs2/quota_local.c | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 5cd6d7771cea..836c4279a979 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1597,6 +1597,10 @@ static int ocfs2_rename(struct mnt_idmap *idmap, if (update_dot_dot) { status = ocfs2_update_entry(old_inode, handle, &old_inode_dot_dot_res, new_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } drop_nlink(old_dir); if (new_inode) { drop_nlink(new_inode); @@ -1636,6 +1640,10 @@ static int ocfs2_rename(struct mnt_idmap *idmap, INODE_CACHE(old_dir), old_dir_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } fe = (struct ocfs2_dinode *) old_dir_bh->b_data; ocfs2_set_links_count(fe, old_dir->i_nlink); ocfs2_journal_dirty(handle, old_dir_bh); diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index dfaae1e52412..e09842fc9d4d 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -1240,6 +1240,10 @@ int ocfs2_create_local_dquot(struct dquot *dquot) &od->dq_local_phys_blk, &pcount, NULL); + if (status < 0) { + mlog_errno(status); + goto out; + } /* Initialize dquot structure on disk */ status = ocfs2_local_write_dquot(dquot); From 68279f9c9f592e75d30a9ba5154a15e0a0b42ae8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 11 Oct 2023 19:55:00 +0300 Subject: [PATCH 48/64] treewide: mark stuff as __ro_after_init __read_mostly predates __ro_after_init. Many variables which are marked __read_mostly should have been __ro_after_init from day 1. Also, mark some stuff as "const" and "__init" while I'm at it. 
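The rule of thumb behind the conversion, shown as a schematic kernel-style fragment (not taken from the patch, and it only builds inside a kernel tree):

    #include <linux/cache.h>
    #include <linux/init.h>
    #include <linux/printk.h>
    #include <linux/slab.h>

    struct example_item {
            int val;
    };

    /* Written exactly once during boot and only read afterwards:
     * __ro_after_init lets the kernel write-protect it after init. */
    static struct kmem_cache *example_cachep __ro_after_init;

    /* Read on hot paths but still writable at runtime (e.g. via a sysctl),
     * so it has to stay __read_mostly. */
    static unsigned int example_limit __read_mostly = 100;

    static int __init example_init(void)
    {
            example_cachep = kmem_cache_create("example_item_cache",
                                               sizeof(struct example_item), 0,
                                               SLAB_PANIC, NULL);
            pr_info("example: limit=%u\n", example_limit);
            return 0;
    }
    late_initcall(example_init);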
[akpm@linux-foundation.org: revert sysctl_nr_open_min, sysctl_nr_open_max changes due to arm warning] [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/4f6bb9c0-abba-4ee4-a7aa-89265e886817@p183 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- block/bdev.c | 6 +++--- fs/anon_inodes.c | 4 ++-- fs/buffer.c | 4 ++-- fs/char_dev.c | 2 +- fs/dcache.c | 8 ++++---- fs/direct-io.c | 2 +- fs/eventpoll.c | 6 +++--- fs/fcntl.c | 2 +- fs/file_table.c | 2 +- fs/inode.c | 8 ++++---- fs/kernfs/mount.c | 5 +++-- fs/locks.c | 4 ++-- fs/namespace.c | 16 ++++++++-------- fs/notify/dnotify/dnotify.c | 6 +++--- fs/notify/fanotify/fanotify_user.c | 8 ++++---- fs/notify/inotify/inotify_user.c | 2 +- fs/pipe.c | 2 +- fs/userfaultfd.c | 2 +- kernel/audit_tree.c | 4 ++-- kernel/sched/core.c | 2 +- kernel/user_namespace.c | 2 +- kernel/workqueue.c | 16 ++++++++-------- lib/debugobjects.c | 2 +- mm/khugepaged.c | 2 +- mm/shmem.c | 8 ++++---- security/integrity/iint.c | 2 +- 26 files changed, 64 insertions(+), 63 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index f3b13aa1b7d4..aea9143d8908 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -292,7 +292,7 @@ EXPORT_SYMBOL(thaw_bdev); */ static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock); -static struct kmem_cache * bdev_cachep __read_mostly; +static struct kmem_cache *bdev_cachep __ro_after_init; static struct inode *bdev_alloc_inode(struct super_block *sb) { @@ -361,13 +361,13 @@ static struct file_system_type bd_type = { .kill_sb = kill_anon_super, }; -struct super_block *blockdev_superblock __read_mostly; +struct super_block *blockdev_superblock __ro_after_init; EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { int err; - static struct vfsmount *bd_mnt; + static struct vfsmount *bd_mnt __ro_after_init; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 24192a7667ed..d26222b7eefe 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -24,8 +24,8 @@ #include -static struct vfsmount *anon_inode_mnt __read_mostly; -static struct inode *anon_inode_inode; +static struct vfsmount *anon_inode_mnt __ro_after_init; +static struct inode *anon_inode_inode __ro_after_init; /* * anon_inodefs_dname() is called from d_path(). diff --git a/fs/buffer.c b/fs/buffer.c index 12e9a71c693d..a19fef583116 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2988,13 +2988,13 @@ EXPORT_SYMBOL(try_to_free_buffers); /* * Buffer-head allocation */ -static struct kmem_cache *bh_cachep __read_mostly; +static struct kmem_cache *bh_cachep __ro_after_init; /* * Once the number of bh's in the machine exceeds this level, we start * stripping them in writeback. 
*/ -static unsigned long max_buffer_heads; +static unsigned long max_buffer_heads __ro_after_init; int buffer_heads_over_limit; diff --git a/fs/char_dev.c b/fs/char_dev.c index 950b6919fb87..3d52f3d3ae77 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -25,7 +25,7 @@ #include "internal.h" -static struct kobj_map *cdev_map; +static struct kobj_map *cdev_map __ro_after_init; static DEFINE_MUTEX(chrdevs_lock); diff --git a/fs/dcache.c b/fs/dcache.c index 25ac74d30bff..0650ccdaa335 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -78,7 +78,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); -static struct kmem_cache *dentry_cache __read_mostly; +static struct kmem_cache *dentry_cache __ro_after_init; const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); @@ -96,9 +96,9 @@ EXPORT_SYMBOL(dotdot_name); * information, yet avoid using a prime hash-size or similar. */ -static unsigned int d_hash_shift __read_mostly; +static unsigned int d_hash_shift __ro_after_init; -static struct hlist_bl_head *dentry_hashtable __read_mostly; +static struct hlist_bl_head *dentry_hashtable __ro_after_init; static inline struct hlist_bl_head *d_hash(unsigned int hash) { @@ -3324,7 +3324,7 @@ static void __init dcache_init(void) } /* SLAB cache for __getname() consumers */ -struct kmem_cache *names_cachep __read_mostly; +struct kmem_cache *names_cachep __ro_after_init; EXPORT_SYMBOL(names_cachep); void __init vfs_caches_init_early(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index 7bc494ee56b9..20533266ade6 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -151,7 +151,7 @@ struct dio { }; } ____cacheline_aligned_in_smp; -static struct kmem_cache *dio_cache __read_mostly; +static struct kmem_cache *dio_cache __ro_after_init; /* * How many pages are in the queue? 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1d9a71a0c4c1..2877cc01cff1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -256,10 +256,10 @@ static u64 loop_check_gen = 0; static struct eventpoll *inserting_into; /* Slab cache used to allocate "struct epitem" */ -static struct kmem_cache *epi_cache __read_mostly; +static struct kmem_cache *epi_cache __ro_after_init; /* Slab cache used to allocate "struct eppoll_entry" */ -static struct kmem_cache *pwq_cache __read_mostly; +static struct kmem_cache *pwq_cache __ro_after_init; /* * List of files with newly added links, where we may need to limit the number @@ -271,7 +271,7 @@ struct epitems_head { }; static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; -static struct kmem_cache *ephead_cache __read_mostly; +static struct kmem_cache *ephead_cache __ro_after_init; static inline void free_ephead(struct epitems_head *head) { diff --git a/fs/fcntl.c b/fs/fcntl.c index e871009f6c88..c80a6acad742 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -844,7 +844,7 @@ int send_sigurg(struct fown_struct *fown) } static DEFINE_SPINLOCK(fasync_lock); -static struct kmem_cache *fasync_cache __read_mostly; +static struct kmem_cache *fasync_cache __ro_after_init; static void fasync_free_rcu(struct rcu_head *head) { diff --git a/fs/file_table.c b/fs/file_table.c index ee21b3da9d08..687d33865035 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -40,7 +40,7 @@ static struct files_stat_struct files_stat = { }; /* SLAB cache for file structures */ -static struct kmem_cache *filp_cachep __read_mostly; +static struct kmem_cache *filp_cachep __ro_after_init; static struct percpu_counter nr_files __cacheline_aligned_in_smp; diff --git a/fs/inode.c b/fs/inode.c index 84bc3c76e5cc..2b1d473a3631 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -54,9 +54,9 @@ * inode_hash_lock */ -static unsigned int i_hash_mask __read_mostly; -static unsigned int i_hash_shift __read_mostly; -static struct hlist_head *inode_hashtable __read_mostly; +static unsigned int i_hash_mask __ro_after_init; +static unsigned int i_hash_shift __ro_after_init; +static struct hlist_head *inode_hashtable __ro_after_init; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); /* @@ -70,7 +70,7 @@ EXPORT_SYMBOL(empty_aops); static DEFINE_PER_CPU(unsigned long, nr_inodes); static DEFINE_PER_CPU(unsigned long, nr_unused); -static struct kmem_cache *inode_cachep __read_mostly; +static struct kmem_cache *inode_cachep __ro_after_init; static long get_nr_inodes(void) { diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index c4bf26142eec..43aea0ad95c8 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -21,8 +21,9 @@ #include "kernfs-internal.h" -struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; -struct kernfs_global_locks *kernfs_locks; +struct kmem_cache *kernfs_node_cache __ro_after_init; +struct kmem_cache *kernfs_iattrs_cache __ro_after_init; +struct kernfs_global_locks *kernfs_locks __ro_after_init; static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { diff --git a/fs/locks.c b/fs/locks.c index 76ad05f8070a..dbd2fb1f7494 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -167,8 +167,8 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); */ static DEFINE_SPINLOCK(blocked_lock_lock); -static struct kmem_cache *flctx_cache __read_mostly; -static struct kmem_cache *filelock_cache __read_mostly; +static struct kmem_cache *flctx_cache __ro_after_init; +static struct kmem_cache *filelock_cache __ro_after_init; static struct file_lock_context * 
locks_get_lock_context(struct inode *inode, int type) diff --git a/fs/namespace.c b/fs/namespace.c index e157efc54023..69df848fbc17 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -39,10 +39,10 @@ /* Maximum number of mounts in a mount namespace */ static unsigned int sysctl_mount_max __read_mostly = 100000; -static unsigned int m_hash_mask __read_mostly; -static unsigned int m_hash_shift __read_mostly; -static unsigned int mp_hash_mask __read_mostly; -static unsigned int mp_hash_shift __read_mostly; +static unsigned int m_hash_mask __ro_after_init; +static unsigned int m_hash_shift __ro_after_init; +static unsigned int mp_hash_mask __ro_after_init; +static unsigned int mp_hash_shift __ro_after_init; static __initdata unsigned long mhash_entries; static int __init set_mhash_entries(char *str) @@ -68,9 +68,9 @@ static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); -static struct hlist_head *mount_hashtable __read_mostly; -static struct hlist_head *mountpoint_hashtable __read_mostly; -static struct kmem_cache *mnt_cache __read_mostly; +static struct hlist_head *mount_hashtable __ro_after_init; +static struct hlist_head *mountpoint_hashtable __ro_after_init; +static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ @@ -86,7 +86,7 @@ struct mount_kattr { }; /* /sys/fs */ -struct kobject *fs_kobj; +struct kobject *fs_kobj __ro_after_init; EXPORT_SYMBOL_GPL(fs_kobj); /* diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index ebdcc25df0f7..7914d223289a 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -39,9 +39,9 @@ static void __init dnotify_sysctl_init(void) #define dnotify_sysctl_init() do { } while (0) #endif -static struct kmem_cache *dnotify_struct_cache __read_mostly; -static struct kmem_cache *dnotify_mark_cache __read_mostly; -static struct fsnotify_group *dnotify_group __read_mostly; +static struct kmem_cache *dnotify_struct_cache __ro_after_init; +static struct kmem_cache *dnotify_mark_cache __ro_after_init; +static struct fsnotify_group *dnotify_group __ro_after_init; /* * dnotify will attach one of these to each inode (i_fsnotify_marks) which diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index f69c451018e3..614b435c4a8c 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -112,10 +112,10 @@ static void __init fanotify_sysctls_init(void) extern const struct fsnotify_ops fanotify_fsnotify_ops; -struct kmem_cache *fanotify_mark_cache __read_mostly; -struct kmem_cache *fanotify_fid_event_cachep __read_mostly; -struct kmem_cache *fanotify_path_event_cachep __read_mostly; -struct kmem_cache *fanotify_perm_event_cachep __read_mostly; +struct kmem_cache *fanotify_mark_cache __ro_after_init; +struct kmem_cache *fanotify_fid_event_cachep __ro_after_init; +struct kmem_cache *fanotify_path_event_cachep __ro_after_init; +struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; #define FANOTIFY_EVENT_ALIGN 4 #define FANOTIFY_FID_INFO_HDR_LEN \ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1c4bfdab008d..a3809ae92170 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -49,7 +49,7 @@ /* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; -struct kmem_cache 
*inotify_inode_mark_cachep __read_mostly; +struct kmem_cache *inotify_inode_mark_cachep __ro_after_init; #ifdef CONFIG_SYSCTL diff --git a/fs/pipe.c b/fs/pipe.c index 139190165a1c..6b279abf0129 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -854,7 +854,7 @@ void free_pipe_info(struct pipe_inode_info *pipe) kfree(pipe); } -static struct vfsmount *pipe_mnt __read_mostly; +static struct vfsmount *pipe_mnt __ro_after_init; /* * pipefs_dname() is called from d_path(). diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 56eaae9dac1a..ed09d70027a0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -49,7 +49,7 @@ static struct ctl_table vm_userfaultfd_table[] = { }; #endif -static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; +static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init; /* * Start with fault_pending_wqh and fault_wqh so they're more likely diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e867c17d3f84..b21b9652c1a8 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -87,8 +87,8 @@ static struct task_struct *prune_thread; * that makes a difference. Some. */ -static struct fsnotify_group *audit_tree_group; -static struct kmem_cache *audit_tree_mark_cachep __read_mostly; +static struct fsnotify_group *audit_tree_group __ro_after_init; +static struct kmem_cache *audit_tree_mark_cachep __ro_after_init; static struct audit_tree *alloc_tree(const char *s) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 802551e0009b..7d4cf741e087 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9903,7 +9903,7 @@ struct task_group root_task_group; LIST_HEAD(task_groups); /* Cacheline aligned slab cache for task_group */ -static struct kmem_cache *task_group_cache __read_mostly; +static struct kmem_cache *task_group_cache __ro_after_init; #endif void __init sched_init(void) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 1d8e47bed3f1..bf2cb8c11571 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -22,7 +22,7 @@ #include #include -static struct kmem_cache *user_ns_cachep __read_mostly; +static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b9f053a5a5f0..96b89f0edbe3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -418,21 +418,21 @@ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; * process context while holding a pool lock. Bounce to a dedicated kthread * worker to avoid A-A deadlocks. 
*/ -static struct kthread_worker *pwq_release_worker; +static struct kthread_worker *pwq_release_worker __ro_after_init; -struct workqueue_struct *system_wq __read_mostly; +struct workqueue_struct *system_wq __ro_after_init; EXPORT_SYMBOL(system_wq); -struct workqueue_struct *system_highpri_wq __read_mostly; +struct workqueue_struct *system_highpri_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_highpri_wq); -struct workqueue_struct *system_long_wq __read_mostly; +struct workqueue_struct *system_long_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_long_wq); -struct workqueue_struct *system_unbound_wq __read_mostly; +struct workqueue_struct *system_unbound_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_unbound_wq); -struct workqueue_struct *system_freezable_wq __read_mostly; +struct workqueue_struct *system_freezable_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_wq); -struct workqueue_struct *system_power_efficient_wq __read_mostly; +struct workqueue_struct *system_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_power_efficient_wq); -struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; +struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); diff --git a/lib/debugobjects.c b/lib/debugobjects.c index a517256a270b..2a8e9d63fbe3 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -89,7 +89,7 @@ static int debug_objects_pool_size __read_mostly static int debug_objects_pool_min_level __read_mostly = ODEBUG_POOL_MIN_LEVEL; static const struct debug_obj_descr *descr_test __read_mostly; -static struct kmem_cache *obj_cache __read_mostly; +static struct kmem_cache *obj_cache __ro_after_init; /* * Track numbers of kmem_cache_alloc()/free() calls done. 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 88433cc25d8a..cb3f1d738810 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -91,7 +91,7 @@ static unsigned int khugepaged_max_ptes_shared __read_mostly; #define MM_SLOTS_HASH_BITS 10 static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); -static struct kmem_cache *mm_slot_cache __read_mostly; +static struct kmem_cache *mm_slot_cache __ro_after_init; struct collapse_control { bool is_khugepaged; diff --git a/mm/shmem.c b/mm/shmem.c index 69595d341882..389212972e72 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -42,7 +42,7 @@ #include #include "swap.h" -static struct vfsmount *shm_mnt; +static struct vfsmount *shm_mnt __ro_after_init; #ifdef CONFIG_SHMEM /* @@ -4394,7 +4394,7 @@ static const struct fs_context_operations shmem_fs_context_ops = { #endif }; -static struct kmem_cache *shmem_inode_cachep; +static struct kmem_cache *shmem_inode_cachep __ro_after_init; static struct inode *shmem_alloc_inode(struct super_block *sb) { @@ -4426,14 +4426,14 @@ static void shmem_init_inode(void *foo) inode_init_once(&info->vfs_inode); } -static void shmem_init_inodecache(void) +static void __init shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); } -static void shmem_destroy_inodecache(void) +static void __init shmem_destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } diff --git a/security/integrity/iint.c b/security/integrity/iint.c index a462df827de2..5b1c2de6cc54 100644 --- a/security/integrity/iint.c +++ b/security/integrity/iint.c @@ -23,7 +23,7 @@ static struct rb_root integrity_iint_tree = RB_ROOT; static DEFINE_RWLOCK(integrity_iint_lock); -static struct kmem_cache *iint_cache __read_mostly; +static struct kmem_cache *iint_cache __ro_after_init; struct dentry *integrity_dir; From 94a03e1d22e8dc75ddec159b2efadd4c6354503a Mon Sep 17 00:00:00 2001 From: Hu Haowen <2023002089@link.tyut.edu.cn> Date: Fri, 13 Oct 2023 21:28:32 +0800 Subject: [PATCH 49/64] scripts/show_delta: add __main__ judgement before main code When doing Python programming it is a nice convention to insert the if statement `if __name__ == "__main__":` before any main code that does actual functionalities to ensure the code will be executed only as a script rather than as an imported module. Hence attach the missing judgement to show_delta. Link: https://lkml.kernel.org/r/20231013132832.165768-1-2023002089@link.tyut.edu.cn Signed-off-by: Hu Haowen <2023002089@link.tyut.edu.cn> Reviewed-by: Nick Desaulniers Reviewed-by: Miguel Ojeda Cc: Greg Kroah-Hartman Cc: Masahiro Yamada Cc: Nicolas Schier Signed-off-by: Andrew Morton --- scripts/show_delta | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/show_delta b/scripts/show_delta index 28e67e178194..291ad65e3089 100755 --- a/scripts/show_delta +++ b/scripts/show_delta @@ -125,4 +125,5 @@ def main(): for line in lines: print (convert_line(line, base_time),) -main() +if __name__ == "__main__": + main() From 5176140c5094b5bfd35e19a4f6ab8a10b65da380 Mon Sep 17 00:00:00 2001 From: Yuanheng Zhang Date: Thu, 12 Oct 2023 00:32:16 +0800 Subject: [PATCH 50/64] ocfs2: fix a typo in a comment Fix spelling typo in comment. 
Link: https://lkml.kernel.org/r/20231011163216.29446-1-yuanhengzhang1214@gmail.com Signed-off-by: Yuanheng Zhang Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/buffer_head_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 196638a22b48..6cb919f60011 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -158,7 +158,7 @@ read_failure: if (new_bh && bh) { /* If middle bh fails, let previous bh * finish its read and then put it to - * aovoid bh leak + * avoid bh leak */ if (!buffer_jbd(bh)) wait_on_buffer(bh); From 873ed7222c17cfd3cba1b1147552171581ffa6ca Mon Sep 17 00:00:00 2001 From: Jia Rui Date: Thu, 19 Oct 2023 03:18:11 +0800 Subject: [PATCH 51/64] ocfs2: replace BUG_ON() at ocfs2_num_free_extents() with ocfs2_error() The BUG_ON() at ocfs2_num_free_extents() handles the case where the l_tree_depth of a leaf extent block just read from disk is invalid. This error is mostly caused by file system metadata corruption on the disk. There is no need to call BUG_ON() to handle such errors. We can return an error code, since the caller can deal with errors from ocfs2_num_free_extents(). Also, we should make the file system read-only to avoid the damage from expanding. Therefore, BUG_ON() is removed and ocfs2_error() is called instead. Link: https://lkml.kernel.org/r/20231018191811.412458-1-jindui71@gmail.com Signed-off-by: Jia Rui Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/alloc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 0fb5e3a875d2..dea3de833b47 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -967,7 +967,14 @@ int ocfs2_num_free_extents(struct ocfs2_extent_tree *et) el = &eb->h_list; } - BUG_ON(el->l_tree_depth != 0); + if (el->l_tree_depth != 0) { + retval = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has leaf extent block %llu with an invalid l_tree_depth of %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)last_eb_blk, + le16_to_cpu(el->l_tree_depth)); + goto bail; + } retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); bail: From 2320222067887f58dc9d7dba2e3ec285d02d45f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 23 Oct 2023 17:33:43 +0200 Subject: [PATCH 52/64] do_io_accounting: use __for_each_thread() Rather than while_each_thread(), which should be avoided when possible. This makes the code more clear and allows the next change. Link: https://lkml.kernel.org/r/20231023153343.GA4629@redhat.com Signed-off-by: Oleg Nesterov Cc: "Eric W.
Biederman" Signed-off-by: Andrew Morton --- fs/proc/base.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 601329f9629a..7779efda9fe2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2972,7 +2972,7 @@ static const struct file_operations proc_coredump_filter_operations = { #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) { - struct task_io_accounting acct = task->ioac; + struct task_io_accounting acct; unsigned long flags; int result; @@ -2986,14 +2986,18 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh } if (whole && lock_task_sighand(task, &flags)) { - struct task_struct *t = task; + struct signal_struct *sig = task->signal; + struct task_struct *t; - task_io_accounting_add(&acct, &task->signal->ioac); - while_each_thread(task, t) + acct = sig->ioac; + __for_each_thread(sig, t) task_io_accounting_add(&acct, &t->ioac); unlock_task_sighand(task, &flags); + } else { + acct = task->ioac; } + seq_printf(m, "rchar: %llu\n" "wchar: %llu\n" From 1df4bd83cdfdbd0720ddb2c6488b7e9a432ba468 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 23 Oct 2023 17:34:05 +0200 Subject: [PATCH 53/64] do_io_accounting: use sig->stats_lock Rather than lock_task_sighand(), sig->stats_lock was specifically designed for this type of use. This way the "if (whole)" branch runs lockless in the likely case. Link: https://lkml.kernel.org/r/20231023153405.GA4639@redhat.com Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- fs/proc/base.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 7779efda9fe2..ca4f0ce64df6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2973,7 +2973,6 @@ static const struct file_operations proc_coredump_filter_operations = { static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) { struct task_io_accounting acct; - unsigned long flags; int result; result = down_read_killable(&task->signal->exec_update_lock); @@ -2985,15 +2984,24 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh goto out_unlock; } - if (whole && lock_task_sighand(task, &flags)) { + if (whole) { struct signal_struct *sig = task->signal; struct task_struct *t; + unsigned int seq = 1; + unsigned long flags; - acct = sig->ioac; - __for_each_thread(sig, t) - task_io_accounting_add(&acct, &t->ioac); + rcu_read_lock(); + do { + seq++; /* 2 on the 1st/lockless path, otherwise odd */ + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); - unlock_task_sighand(task, &flags); + acct = sig->ioac; + __for_each_thread(sig, t) + task_io_accounting_add(&acct, &t->ioac); + + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); + rcu_read_unlock(); } else { acct = task->ioac; } From 639931020e1a70308a5c71f4702443429cb6899c Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 26 Oct 2023 08:56:34 +0800 Subject: [PATCH 54/64] fs/proc/base.c: remove unneeded semicolon ./fs/proc/base.c:3829:2-3: Unneeded semicolon Link: https://lkml.kernel.org/r/20231026005634.6581-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7057 Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/fs/proc/base.c b/fs/proc/base.c index ca4f0ce64df6..b13d3e804deb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3826,7 +3826,7 @@ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, for_each_thread(task, pos) { if (!nr--) goto found; - }; + } fail: pos = NULL; goto out; From 20e34aa7e08dbac5d7f757fea81fae8df462aa42 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 27 Oct 2023 17:21:03 +0300 Subject: [PATCH 55/64] proc: fix proc-empty-vm test with vsyscall * fix embarrassing /proc/*/smaps test bug caused by a typo in a variable name: it tested only the first line of the output if vsyscall is enabled, ffffffffff600000-ffffffffff601000 r-xp ..., so the test passed but checked only the VMA location and permissions. * add "KSM" entry, unnoticed because (1) * swap "r-xp" and "--xp" vsyscall test strings, also unnoticed because (1) Link: https://lkml.kernel.org/r/76f42cce-b1ab-45ec-b6b2-4c64f0dccb90@p183 Signed-off-by: Alexey Dobriyan Tested-by: Swarup Laxman Kotikalapudi Signed-off-by: Andrew Morton --- tools/testing/selftests/proc/proc-empty-vm.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/proc/proc-empty-vm.c b/tools/testing/selftests/proc/proc-empty-vm.c index 171d46a1ac27..1cbdfe8a140e 100644 --- a/tools/testing/selftests/proc/proc-empty-vm.c +++ b/tools/testing/selftests/proc/proc-empty-vm.c @@ -60,7 +60,7 @@ static const char proc_pid_maps_vsyscall_2[] = static const char proc_pid_smaps_vsyscall_0[] = ""; static const char proc_pid_smaps_vsyscall_1[] = -"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n" +"ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n" "Size: 4 kB\n" "KernelPageSize: 4 kB\n" "MMUPageSize: 4 kB\n" @@ -73,6 +73,7 @@ static const char proc_pid_smaps_vsyscall_1[] = "Private_Dirty: 0 kB\n" "Referenced: 0 kB\n" "Anonymous: 0 kB\n" +"KSM: 0 kB\n" "LazyFree: 0 kB\n" "AnonHugePages: 0 kB\n" "ShmemPmdMapped: 0 kB\n" @@ -90,7 +91,7 @@ static const char proc_pid_smaps_vsyscall_1[] = ; static const char proc_pid_smaps_vsyscall_2[] = -"ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n" +"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n" "Size: 4 kB\n" "KernelPageSize: 4 kB\n" "MMUPageSize: 4 kB\n" @@ -103,6 +104,7 @@ static const char proc_pid_smaps_vsyscall_2[] = "Private_Dirty: 0 kB\n" "Referenced: 0 kB\n" "Anonymous: 0 kB\n" +"KSM: 0 kB\n" "LazyFree: 0 kB\n" "AnonHugePages: 0 kB\n" "ShmemPmdMapped: 0 kB\n" @@ -244,10 +246,10 @@ static int test_proc_pid_smaps(pid_t pid) if (g_vsyscall == 0) { assert(rv == 0); } else { - size_t len = strlen(g_proc_pid_maps_vsyscall); + size_t len = strlen(g_proc_pid_smaps_vsyscall); /* TODO "ProtectionKey:" */ assert(rv > len); - assert(memcmp(buf, g_proc_pid_maps_vsyscall, len) == 0); + assert(memcmp(buf, g_proc_pid_smaps_vsyscall, len) == 0); } return EXIT_SUCCESS; } From bf5add391eeb450b502d2593c05f84982724e6a2 Mon Sep 17 00:00:00 2001 From: Swarup Laxman Kotiaklapudi Date: Fri, 27 Oct 2023 17:26:24 +0300 Subject: [PATCH 56/64] proc: test ProtectionKey in proc-empty-vm test Check ProtectionKey field in /proc/*/smaps output, if system supports protection keys feature.
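For context (not part of the patch): a minimal standalone sketch of how such a support probe can work, using raw syscalls so it builds even where libc lacks pkey_alloc(3). The only numbers reused here (330/331) are the x86-64 syscall numbers cited by the patch below; everything else (names, output) is invented for illustration.

        /* Standalone sketch: report whether memory protection keys are usable. */
        #include <errno.h>
        #include <stdio.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        #ifndef SYS_pkey_alloc
        #define SYS_pkey_alloc 330      /* x86-64 */
        #define SYS_pkey_free  331
        #endif

        int main(void)
        {
                long pkey = syscall(SYS_pkey_alloc, 0, 0);

                if (pkey > 0) {
                        syscall(SYS_pkey_free, (int)pkey);
                        puts("protection keys: supported");
                } else if (pkey == -1 && (errno == ENOSYS || errno == EINVAL)) {
                        /* ENOSYS: no syscall; EINVAL: hardware support disabled */
                        puts("protection keys: not supported");
                } else {
                        fprintf(stderr, "pkey_alloc: rv %ld, errno %d\n", pkey, errno);
                        return 1;
                }
                return 0;
        }

Allocating and immediately freeing a key is a cheap way to distinguish "kernel/hardware lacks pkeys" from real failures, which is the same approach the selftest takes below.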
[adobriyan@gmail.com: test support in the beginning of the program, use syscall, not glibc pkey_alloc(3) which may not compile] Link: https://lkml.kernel.org/r/ac05efa7-d2a0-48ad-b704-ffdd5450582e@p183 Signed-off-by: Swarup Laxman Kotiaklapudi Signed-off-by: Alexey Dobriyan Reviewed-by: Swarup Laxman Kotikalapudi Tested-by: Swarup Laxman Kotikalapudi Signed-off-by: Andrew Morton --- tools/testing/selftests/proc/proc-empty-vm.c | 83 +++++++++++++++----- 1 file changed, 63 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/proc/proc-empty-vm.c b/tools/testing/selftests/proc/proc-empty-vm.c index 1cbdfe8a140e..56198d4ca2bf 100644 --- a/tools/testing/selftests/proc/proc-empty-vm.c +++ b/tools/testing/selftests/proc/proc-empty-vm.c @@ -23,6 +23,9 @@ * /proc/${pid}/smaps * /proc/${pid}/smaps_rollup */ +#undef _GNU_SOURCE +#define _GNU_SOURCE + #undef NDEBUG #include #include @@ -34,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +46,43 @@ #define TEST_VSYSCALL #endif +#if defined __amd64__ + #ifndef SYS_pkey_alloc + #define SYS_pkey_alloc 330 + #endif + #ifndef SYS_pkey_free + #define SYS_pkey_free 331 + #endif +#elif defined __i386__ + #ifndef SYS_pkey_alloc + #define SYS_pkey_alloc 381 + #endif + #ifndef SYS_pkey_free + #define SYS_pkey_free 382 + #endif +#else + #error "SYS_pkey_alloc" +#endif + +static int g_protection_key_support; + +static int protection_key_support(void) +{ + long rv = syscall(SYS_pkey_alloc, 0, 0); + if (rv > 0) { + syscall(SYS_pkey_free, (int)rv); + return 1; + } else if (rv == -1 && errno == ENOSYS) { + return 0; + } else if (rv == -1 && errno == EINVAL) { + // ospke=n + return 0; + } else { + fprintf(stderr, "%s: error: rv %ld, errno %d\n", __func__, rv, errno); + exit(EXIT_FAILURE); + } +} + /* * 0: vsyscall VMA doesn't exist vsyscall=none * 1: vsyscall VMA is --xp vsyscall=xonly @@ -84,10 +125,6 @@ static const char proc_pid_smaps_vsyscall_1[] = "SwapPss: 0 kB\n" "Locked: 0 kB\n" "THPeligible: 0\n" -/* - * "ProtectionKey:" field is conditional. It is possible to check it as well, - * but I don't have such machine. - */ ; static const char proc_pid_smaps_vsyscall_2[] = @@ -115,10 +152,6 @@ static const char proc_pid_smaps_vsyscall_2[] = "SwapPss: 0 kB\n" "Locked: 0 kB\n" "THPeligible: 0\n" -/* - * "ProtectionKey:" field is conditional. It is possible to check it as well, - * but I'm too tired. 
- */ ; static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___) @@ -240,19 +273,27 @@ static int test_proc_pid_smaps(pid_t pid) } perror("open /proc/${pid}/smaps"); return EXIT_FAILURE; - } else { - ssize_t rv = read(fd, buf, sizeof(buf)); - close(fd); - if (g_vsyscall == 0) { - assert(rv == 0); - } else { - size_t len = strlen(g_proc_pid_smaps_vsyscall); - /* TODO "ProtectionKey:" */ - assert(rv > len); - assert(memcmp(buf, g_proc_pid_smaps_vsyscall, len) == 0); - } - return EXIT_SUCCESS; } + ssize_t rv = read(fd, buf, sizeof(buf)); + close(fd); + + assert(0 <= rv); + assert(rv <= sizeof(buf)); + + if (g_vsyscall == 0) { + assert(rv == 0); + } else { + size_t len = strlen(g_proc_pid_smaps_vsyscall); + assert(rv > len); + assert(memcmp(buf, g_proc_pid_smaps_vsyscall, len) == 0); + + if (g_protection_key_support) { +#define PROTECTION_KEY "ProtectionKey: 0\n" + assert(memmem(buf, rv, PROTECTION_KEY, strlen(PROTECTION_KEY))); + } + } + + return EXIT_SUCCESS; } static const char g_smaps_rollup[] = @@ -419,6 +460,8 @@ int main(void) abort(); } + g_protection_key_support = protection_key_support(); + pid_t pid = fork(); if (pid == -1) { perror("fork"); From e3bc0c427f2aee757c249e0f087b0a0e810d66e3 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Wed, 25 Oct 2023 15:29:06 +0800 Subject: [PATCH 57/64] ocfs2: fix a spelling typo in comment Fix a spelling typo in comment. Link: https://lkml.kernel.org/r/20231025072906.14285-1-chentao@kylinos.cn Signed-off-by: Kunwu Chan Acked-by: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/buffer_head_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 6cb919f60011..cdb9b9bdea1f 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -345,7 +345,7 @@ read_failure: if (new_bh && bh) { /* If middle bh fails, let previous bh * finish its read and then put it to - * aovoid bh leak + * avoid bh leak */ if (!buffer_jbd(bh)) wait_on_buffer(bh); From cd24f44050f31d69ed5851b55ef77ea6346aa814 Mon Sep 17 00:00:00 2001 From: Deepak Gupta Date: Thu, 26 Oct 2023 16:38:23 -0700 Subject: [PATCH 58/64] scripts/gdb: add lx_current support for riscv csr_sscratch CSR holds current task_struct address when hart is in user space. Trap handler on entry spills csr_sscratch into "tp" (x2) register and zeroes out csr_sscratch CSR. Trap handler on exit reloads "tp" with expected user mode value and place current task_struct address again in csr_sscratch CSR. This patch assumes "tp" is pointing to task_struct. If value in csr_sscratch is numerically greater than "tp" then it assumes csr_sscratch is correct address of current task_struct. This logic holds when - hart is in user space, "tp" will be less than csr_sscratch. - hart is in kernel space but not in trap handler, "tp" will be more than csr_sscratch (csr_sscratch being equal to 0). - hart is executing trap handler - "tp" is still pointing to user mode but csr_sscratch contains ptr to task_struct. Thus numerically higher. 
- "tp" is pointing to task_struct but csr_sscratch now contains either 0 or numerically smaller value (transiently holds user mode tp) Link: https://lkml.kernel.org/r/20231026233837.612405-1-debug@rivosinc.com Signed-off-by: Deepak Gupta Reviewed-by: Andrew Jones Reviewed-by: Palmer Dabbelt Acked-by: Palmer Dabbelt Tested-by: Hsieh-Tseng Shen Cc: Albert Ou Cc: Glenn Washburn Cc: Jan Kiszka Cc: Jeff Xie Cc: Kieran Bingham Cc: Palmer Dabbelt Cc: Paul Walmsley Signed-off-by: Andrew Morton --- scripts/gdb/linux/cpus.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py index 255dc18cb9da..cba589e5b57d 100644 --- a/scripts/gdb/linux/cpus.py +++ b/scripts/gdb/linux/cpus.py @@ -179,6 +179,21 @@ def get_current_task(cpu): else: raise gdb.GdbError("Sorry, obtaining the current task is not allowed " "while running in userspace(EL0)") + elif utils.is_target_arch("riscv"): + current_tp = gdb.parse_and_eval("$tp") + scratch_reg = gdb.parse_and_eval("$sscratch") + + # by default tp points to current task + current_task = current_tp.cast(task_ptr_type) + + # scratch register is set 0 in trap handler after entering kernel. + # When hart is in user mode, scratch register is pointing to task_struct. + # and tp is used by user mode. So when scratch register holds larger value + # (negative address as ulong is larger value) than tp, then use scratch register. + if (scratch_reg.cast(utils.get_ulong_type()) > current_tp.cast(utils.get_ulong_type())): + current_task = scratch_reg.cast(task_ptr_type) + + return current_task.dereference() else: raise gdb.GdbError("Sorry, obtaining the current task is not yet " "supported with this arch") From 90723a82d8a54d98b3ed77161eb5f786ec2b3927 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Mon, 30 Oct 2023 21:24:55 +0700 Subject: [PATCH 59/64] .mailmap: map Benjamin Poirier's address Map out to his gmail address as he had left SUSE some time ago. Link: https://lkml.kernel.org/r/20231030142454.22127-2-bagasdotme@gmail.com Signed-off-by: Bagas Sanjaya Acked-by: Benjamin Poirier Cc: Bjorn Andersson Cc: Greg Kroah-Hartman Cc: Heiko Stuebner Cc: Jakub Kicinski Cc: Konrad Dybcio Cc: Oleksij Rempel Cc: Stephen Hemminger Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index a0a6efe87186..9a8b0f6066fa 100644 --- a/.mailmap +++ b/.mailmap @@ -94,6 +94,7 @@ Ben M Cahill Ben Widawsky Ben Widawsky Ben Widawsky +Benjamin Poirier Bjorn Andersson Bjorn Andersson Bjorn Andersson From 2ffc27b15b11c9584ac46335c2ed2248d2aa4137 Mon Sep 17 00:00:00 2001 From: Itaru Kitayama Date: Mon, 30 Oct 2023 17:54:45 +0900 Subject: [PATCH 60/64] tools/testing/selftests/mm/run_vmtests.sh: lower the ptrace permissions On Ubuntu and probably other distros, ptrace permissions are tightend a bit by default; i.e., /proc/sys/kernel/yama/ptrace_score is set to 1. This cases memfd_secret's ptrace attach test fails with a permission error. Set it to 0 piror to running the program. 
Link: https://lkml.kernel.org/r/20231030-selftest-v1-1-743df68bb996@linux.dev Signed-off-by: Itaru Kitayama Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 3e2bc818d566..7d31718ce834 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -303,6 +303,7 @@ CATEGORY="hmm" run_test bash ./test_hmm.sh smoke # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests CATEGORY="madv_populate" run_test ./madv_populate +echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope CATEGORY="memfd_secret" run_test ./memfd_secret # KSM KSM_MERGE_TIME_HUGE_PAGES test with size of 100 From fbbc2af38463add04af0117671b006866052e43b Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Mon, 30 Oct 2023 08:36:32 +0200 Subject: [PATCH 61/64] mailmap: update email address for Claudiu Beznea Claudiu Beznea's Microchip email address is no longer valid. Map it to a valid one. Link: https://lkml.kernel.org/r/20231030063632.1707372-1-claudiu.beznea@tuxon.dev Signed-off-by: Claudiu Beznea Cc: Bjorn Andersson Cc: Heiko Stuebner Cc: Jakub Kicinski Cc: Jens Axboe Cc: Konrad Dybcio Cc: Oleksij Rempel Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 9a8b0f6066fa..b325732e31eb 100644 --- a/.mailmap +++ b/.mailmap @@ -128,6 +128,7 @@ Christian Brauner Christian Marangi Christophe Ricard Christoph Hellwig +Claudiu Beznea Colin Ian King Corey Minyard Damian Hobson-Garcia From 4aa8f278b94e1885a9b42c1c765c4edddfdc42f1 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Tue, 31 Oct 2023 08:40:00 +0700 Subject: [PATCH 62/64] .mailmap: add address mapping for Tomeu Vizoso He's no longer working at Collabora (and his email address there bounces). Map it to his personal address. Link: https://lkml.kernel.org/r/20231031014009.22765-2-bagasdotme@gmail.com Signed-off-by: Bagas Sanjaya Acked-by: Tomeu Vizoso Cc: Bjorn Andersson Cc: Heiko Stuebner Cc: Jakub Kicinski Cc: Jens Axboe Cc: Konrad Dybcio Cc: Stephen Hemminger Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index b325732e31eb..27beb64673b8 100644 --- a/.mailmap +++ b/.mailmap @@ -567,6 +567,7 @@ Takashi YOSHII Tamizh Chelvam Raja Taniya Das Tejun Heo +Tomeu Vizoso Thomas Graf Thomas Körper Thomas Pedersen From 16501630bdeb107141a0139ddc33f92ab5582c6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Tue, 31 Oct 2023 13:49:04 +0000 Subject: [PATCH 63/64] scripts/gdb: fix usage of MOD_TEXT not defined when CONFIG_MODULES=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MOD_TEXT is only defined if CONFIG_MODULES=y, which leads to a loading failure of the gdb scripts when the kernel is built without CONFIG_MODULES=y: Reading symbols from vmlinux... Traceback (most recent call last): File "/foo/vmlinux-gdb.py", line 25, in import linux.constants File "/foo/scripts/gdb/linux/constants.py", line 14, in LX_MOD_TEXT = gdb.parse_and_eval("MOD_TEXT") ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ gdb.error: No symbol "MOD_TEXT" in current context. Add a conditional check on CONFIG_MODULES to fix this error.
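For context (not part of the patch): the same guard applies to any C-side user of these constants, since the enum that provides MOD_TEXT is only available with CONFIG_MODULES=y. A hypothetical sketch of the pattern, with the helper name invented here:

        /* Hypothetical sketch: references to MOD_* must sit behind the same
         * CONFIG_MODULES guard as the enum that defines them. */
        #include <linux/module.h>

        static const char *hypothetical_mod_mem_name(int type)
        {
        #ifdef CONFIG_MODULES
                switch (type) {
                case MOD_TEXT:          return "text";
                case MOD_DATA:          return "data";
                case MOD_RODATA:        return "rodata";
                case MOD_RO_AFTER_INIT: return "ro_after_init";
                }
        #endif
                return "n/a";   /* CONFIG_MODULES=n: the constants do not exist */
        }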
Link: https://lkml.kernel.org/r/20231031134848.119391-1-da.gomez@samsung.com Fixes: b4aff7513df3 ("scripts/gdb: use mem instead of core_layout to get the module address") Signed-off-by: Clément Léger Tested-by: Daniel Gomez Signed-off-by: Daniel Gomez Cc: Jan Kiszka Cc: Kieran Bingham Cc: Luis Chamberlain Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- scripts/gdb/linux/constants.py.in | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index e3517d4ab8ec..04c87b570aab 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -66,10 +66,11 @@ LX_GDBPARSED(IRQD_LEVEL) LX_GDBPARSED(IRQ_HIDDEN) /* linux/module.h */ -LX_GDBPARSED(MOD_TEXT) -LX_GDBPARSED(MOD_DATA) -LX_GDBPARSED(MOD_RODATA) -LX_GDBPARSED(MOD_RO_AFTER_INIT) +if IS_BUILTIN(CONFIG_MODULES): + LX_GDBPARSED(MOD_TEXT) + LX_GDBPARSED(MOD_DATA) + LX_GDBPARSED(MOD_RODATA) + LX_GDBPARSED(MOD_RO_AFTER_INIT) /* linux/mount.h */ LX_VALUE(MNT_NOSUID) From 6620999f0d41e4fd6f047727936a964c3399d249 Mon Sep 17 00:00:00 2001 From: Ben Wolsieffer Date: Tue, 31 Oct 2023 16:22:36 -0400 Subject: [PATCH 64/64] scripts/gdb/vmalloc: disable on no-MMU vmap_area does not exist on no-MMU, therefore the GDB scripts fail to load: Traceback (most recent call last): File "<...>/vmlinux-gdb.py", line 51, in import linux.vmalloc File "<...>/scripts/gdb/linux/vmalloc.py", line 14, in vmap_area_ptr_type = vmap_area_type.get_type().pointer() ^^^^^^^^^^^^^^^^^^^^^^^^^ File "<...>/scripts/gdb/linux/utils.py", line 28, in get_type self._type = gdb.lookup_type(self._name) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ gdb.error: No struct type named vmap_area. To fix this, disable the command and add an informative error message if CONFIG_MMU is not defined, following the example of lx-slabinfo. 
Link: https://lkml.kernel.org/r/20231031202235.2655333-2-ben.wolsieffer@hefring.com Fixes: 852622bf3616 ("scripts/gdb/vmalloc: add vmallocinfo support") Signed-off-by: Ben Wolsieffer Cc: Jan Kiszka Cc: Kieran Bingham Cc: Kuan-Ying Lee Cc: Signed-off-by: Andrew Morton --- scripts/gdb/linux/constants.py.in | 1 + scripts/gdb/linux/vmalloc.py | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index 04c87b570aab..e810e0c27ff1 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -158,3 +158,4 @@ LX_CONFIG(CONFIG_STACKDEPOT) LX_CONFIG(CONFIG_PAGE_OWNER) LX_CONFIG(CONFIG_SLUB_DEBUG) LX_CONFIG(CONFIG_SLAB_FREELIST_HARDENED) +LX_CONFIG(CONFIG_MMU) diff --git a/scripts/gdb/linux/vmalloc.py b/scripts/gdb/linux/vmalloc.py index 48e4a4fae7bb..d3c8a0274d1e 100644 --- a/scripts/gdb/linux/vmalloc.py +++ b/scripts/gdb/linux/vmalloc.py @@ -10,8 +10,9 @@ import gdb import re from linux import lists, utils, stackdepot, constants, mm -vmap_area_type = utils.CachedType('struct vmap_area') -vmap_area_ptr_type = vmap_area_type.get_type().pointer() +if constants.LX_CONFIG_MMU: + vmap_area_type = utils.CachedType('struct vmap_area') + vmap_area_ptr_type = vmap_area_type.get_type().pointer() def is_vmalloc_addr(x): pg_ops = mm.page_ops().ops @@ -25,6 +26,9 @@ class LxVmallocInfo(gdb.Command): super(LxVmallocInfo, self).__init__("lx-vmallocinfo", gdb.COMMAND_DATA) def invoke(self, arg, from_tty): + if not constants.LX_CONFIG_MMU: + raise gdb.GdbError("Requires MMU support") + vmap_area_list = gdb.parse_and_eval('vmap_area_list') for vmap_area in lists.list_for_each_entry(vmap_area_list, vmap_area_ptr_type, "list"): if not vmap_area['vm']: