From 0af96a024f524a5318485cbada73ab7d874895d4 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 28 Apr 2022 23:17:25 -0700 Subject: [PATCH 01/65] ia64: fix typos in comments Various spelling mistakes in comments. Detected with the help of Coccinelle. Link: https://lkml.kernel.org/r/20220318103729.157574-1-Julia.Lawall@inria.fr Signed-off-by: Julia Lawall Signed-off-by: Andrew Morton --- arch/ia64/kernel/palinfo.c | 2 +- arch/ia64/kernel/traps.c | 2 +- arch/ia64/mm/init.c | 2 +- arch/ia64/mm/tlb.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c index 64189f04c1a4..b9ae093bfe37 100644 --- a/arch/ia64/kernel/palinfo.c +++ b/arch/ia64/kernel/palinfo.c @@ -120,7 +120,7 @@ static const char *mem_attrib[]={ * Input: * - a pointer to a buffer to hold the string * - a 64-bit vector - * Ouput: + * Output: * - a pointer to the end of the buffer * */ diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index 753642366e12..53735b1d1be3 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -309,7 +309,7 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr) /* * Lower 4 bits are used as a count. Upper bits are a sequence * number that is updated when count is reset. The cmpxchg will - * fail is seqno has changed. This minimizes mutiple cpus + * fail is seqno has changed. This minimizes multiple cpus * resetting the count. */ if (current_jiffies > last.time) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 5d165607bf35..7ae1244ed8ec 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -451,7 +451,7 @@ mem_init (void) memblock_free_all(); /* - * For fsyscall entrpoints with no light-weight handler, use the ordinary + * For fsyscall entrypoints with no light-weight handler, use the ordinary * (heavy-weight) handler, but mark it by setting bit 0, so the fsyscall entry * code can tell them apart. 
*/ diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index 135b5135cace..ca060e7a2a46 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c @@ -174,7 +174,7 @@ __setup("nptcg=", set_nptcg); * override table (in which case we should ignore the value from * PAL_VM_SUMMARY). * - * Kernel parameter "nptcg=" overrides maximum number of simultanesous ptc.g + * Kernel parameter "nptcg=" overrides maximum number of simultaneous ptc.g * purges defined in either PAL_VM_SUMMARY or PAL override table. In this case, * we should ignore the value from either PAL_VM_SUMMARY or PAL override table. * @@ -516,7 +516,7 @@ found: if (i >= per_cpu(ia64_tr_num, cpu)) return -EBUSY; - /*Record tr info for mca hander use!*/ + /*Record tr info for mca handler use!*/ if (i > per_cpu(ia64_tr_used, cpu)) per_cpu(ia64_tr_used, cpu) = i; From 72a4fd6a7f032b921b1c195eb42a038ab9026021 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 28 Apr 2022 23:17:25 -0700 Subject: [PATCH 02/65] ia64: ptrace: fix typos in comments Various spelling mistakes in comments. Detected with the help of Coccinelle. Link: https://lkml.kernel.org/r/20220318103729.157574-23-Julia.Lawall@inria.fr Signed-off-by: Julia Lawall Signed-off-by: Andrew Morton --- arch/ia64/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index a19acd9f5e1f..4fc6e38a8459 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -2025,7 +2025,7 @@ static void syscall_get_args_cb(struct unw_frame_info *info, void *data) * - epsinstruction: cfm is set by br.call * locals don't exist. * - * For both cases argguments are reachable in cfm.sof - cfm.sol. + * For both cases arguments are reachable in cfm.sof - cfm.sol. * CFM: [ ... 
| sor: 17..14 | sol : 13..7 | sof : 6..0 ] */ cfm = pt->cr_ifs; From bd7155a0282e2f4e14260c30272d6472253e6564 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Benjamin=20St=C3=BCrz?= Date: Thu, 28 Apr 2022 23:17:25 -0700 Subject: [PATCH 03/65] ia64: replace comments with C99 initializers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This replaces comments with C99's designated initializers because the kernel supports them now. Link: https://lkml.kernel.org/r/20220326165909.506926-3-benni@stuerz.xyz Signed-off-by: Benjamin Stürz Signed-off-by: Andrew Morton --- arch/ia64/kernel/kprobes.c | 64 +++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 1a7bab1c5d7c..ca34e51e84b4 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -29,38 +29,38 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}}; enum instruction_type {A, I, M, F, B, L, X, u}; static enum instruction_type bundle_encoding[32][3] = { - { M, I, I }, /* 00 */ - { M, I, I }, /* 01 */ - { M, I, I }, /* 02 */ - { M, I, I }, /* 03 */ - { M, L, X }, /* 04 */ - { M, L, X }, /* 05 */ - { u, u, u }, /* 06 */ - { u, u, u }, /* 07 */ - { M, M, I }, /* 08 */ - { M, M, I }, /* 09 */ - { M, M, I }, /* 0A */ - { M, M, I }, /* 0B */ - { M, F, I }, /* 0C */ - { M, F, I }, /* 0D */ - { M, M, F }, /* 0E */ - { M, M, F }, /* 0F */ - { M, I, B }, /* 10 */ - { M, I, B }, /* 11 */ - { M, B, B }, /* 12 */ - { M, B, B }, /* 13 */ - { u, u, u }, /* 14 */ - { u, u, u }, /* 15 */ - { B, B, B }, /* 16 */ - { B, B, B }, /* 17 */ - { M, M, B }, /* 18 */ - { M, M, B }, /* 19 */ - { u, u, u }, /* 1A */ - { u, u, u }, /* 1B */ - { M, F, B }, /* 1C */ - { M, F, B }, /* 1D */ - { u, u, u }, /* 1E */ - { u, u, u }, /* 1F */ + [0x00] = { M, I, I }, + [0x01] = { M, I, I }, + [0x02] = { M, I, I }, + [0x03] = { M, I, I }, + [0x04] = { M, L, X }, + [0x05] = { M, L, X 
}, + [0x06] = { u, u, u }, + [0x07] = { u, u, u }, + [0x08] = { M, M, I }, + [0x09] = { M, M, I }, + [0x0A] = { M, M, I }, + [0x0B] = { M, M, I }, + [0x0C] = { M, F, I }, + [0x0D] = { M, F, I }, + [0x0E] = { M, M, F }, + [0x0F] = { M, M, F }, + [0x10] = { M, I, B }, + [0x11] = { M, I, B }, + [0x12] = { M, B, B }, + [0x13] = { M, B, B }, + [0x14] = { u, u, u }, + [0x15] = { u, u, u }, + [0x16] = { B, B, B }, + [0x17] = { B, B, B }, + [0x18] = { M, M, B }, + [0x19] = { M, M, B }, + [0x1A] = { u, u, u }, + [0x1B] = { u, u, u }, + [0x1C] = { M, F, B }, + [0x1D] = { M, F, B }, + [0x1E] = { u, u, u }, + [0x1F] = { u, u, u }, }; /* Insert a long branch code */ From 3af8acf6aff2a98731522b52927429760f0b8006 Mon Sep 17 00:00:00 2001 From: Schspa Shi Date: Fri, 29 Apr 2022 14:37:57 -0700 Subject: [PATCH 04/65] scripts/decode_stacktrace.sh: support old bash version Old bash versions don't support associative array variables. Avoid using associative array variables to avoid errors. Without this, old bash versions will report an error as follows: [ 15.954042] Kernel panic - not syncing: sysrq triggered crash [ 15.955252] CPU: 1 PID: 167 Comm: sh Not tainted 5.18.0-rc1-00208-gb7d075db2fd5 #4 [ 15.956472] Hardware name: Hobot J5 Virtual development board (DT) [ 15.957856] Call trace: ./scripts/decode_stacktrace.sh: line 128: ,dump_backtrace: syntax error: operand expected (error token is ",dump_backtrace") Link: https://lkml.kernel.org/r/20220409180331.24047-1-schspa@gmail.com Signed-off-by: Schspa Shi Cc: Stephen Boyd Signed-off-by: Andrew Morton --- scripts/decode_stacktrace.sh | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index 5fbad61fe490..7075e26ab2c4 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -45,8 +45,13 @@ else fi fi -declare -A cache -declare -A modcache +declare aarray_support=true +declare -A cache 2>/dev/null +if [[ $?
!= 0 ]]; then + aarray_support=false +else + declare -A modcache +fi find_module() { if [[ -n $debuginfod ]] ; then @@ -97,7 +102,7 @@ parse_symbol() { if [[ $module == "" ]] ; then local objfile=$vmlinux - elif [[ "${modcache[$module]+isset}" == "isset" ]]; then + elif [[ $aarray_support == true && "${modcache[$module]+isset}" == "isset" ]]; then local objfile=${modcache[$module]} else local objfile=$(find_module) @@ -105,7 +110,9 @@ parse_symbol() { echo "WARNING! Modules path isn't set, but is needed to parse this symbol" >&2 return fi - modcache[$module]=$objfile + if [[ $aarray_support == true ]]; then + modcache[$module]=$objfile + fi fi # Remove the englobing parenthesis @@ -125,7 +132,7 @@ parse_symbol() { # Use 'nm vmlinux' to figure out the base address of said symbol. # It's actually faster to call it every time than to load it # all into bash. - if [[ "${cache[$module,$name]+isset}" == "isset" ]]; then + if [[ $aarray_support == true && "${cache[$module,$name]+isset}" == "isset" ]]; then local base_addr=${cache[$module,$name]} else local base_addr=$(nm "$objfile" 2>/dev/null | awk '$3 == "'$name'" && ($2 == "t" || $2 == "T") {print $1; exit}') @@ -133,7 +140,9 @@ parse_symbol() { # address not found return fi - cache[$module,$name]="$base_addr" + if [[ $aarray_support == true ]]; then + cache[$module,$name]="$base_addr" + fi fi # Let's start doing the math to get the exact address into the # symbol. First, strip out the symbol total length. 
@@ -149,11 +158,13 @@ parse_symbol() { # Pass it to addr2line to get filename and line number # Could get more than one result - if [[ "${cache[$module,$address]+isset}" == "isset" ]]; then + if [[ $aarray_support == true && "${cache[$module,$address]+isset}" == "isset" ]]; then local code=${cache[$module,$address]} else local code=$(${CROSS_COMPILE}addr2line -i -e "$objfile" "$address" 2>/dev/null) - cache[$module,$address]=$code + if [[ $aarray_support == true ]]; then + cache[$module,$address]=$code + fi fi # addr2line doesn't return a proper error code if it fails, so From dec81a532027a77bd52f9bd8d8b3230843533d3f Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 29 Apr 2022 14:37:57 -0700 Subject: [PATCH 05/65] scripts/bloat-o-meter: filter out vermagic as it is not relevant Seeing it as a false positive increase at the top is just noise: linux-head$./scripts/bloat-o-meter ../pre/vmlinux ../post/vmlinux add/remove: 0/571 grow/shrink: 1/9 up/down: 20/-64662 (-64642) Function old new delta vermagic 49 69 +20 Since it really doesn't "grow", it makes sense to filter it out. 
Link: https://lkml.kernel.org/r/20220428035824.7934-1-paul.gortmaker@windriver.com Signed-off-by: Paul Gortmaker Signed-off-by: Andrew Morton --- scripts/bloat-o-meter | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter index dcd8d8750b8b..4dd6a804ce41 100755 --- a/scripts/bloat-o-meter +++ b/scripts/bloat-o-meter @@ -36,6 +36,7 @@ def getsizes(file, format): if name.startswith("__se_compat_sys"): continue if name.startswith("__addressable_"): continue if name == "linux_banner": continue + if name == "vermagic": continue # statics and some other optimizations adds random .NUMBER name = re_NUMBER.sub('', name) sym[name] = sym.get(name, 0) + int(size, 16) From 81cd1ae909e0080eb41457766f0f448fd8ab9979 Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Fri, 29 Apr 2022 14:37:57 -0700 Subject: [PATCH 06/65] ocfs2: replace usage of found with dedicated list iterator variable To move the list iterator variable into the list_for_each_entry_*() macro in the future it should be avoided to use the list iterator variable after the loop body. To *never* use the list iterator variable after the loop it was concluded to use a separate iterator variable instead of a found boolean [1]. This removes the need for a found variable: simply checking whether the variable was set determines if the break/goto was hit.
Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220324071650.61168-1-jakobkoschel@gmail.com Signed-off-by: Jakob Koschel Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmunlock.c | 21 ++++++++++----------- fs/ocfs2/quota_local.c | 10 +++++----- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 61103b2d69fb..7318e4794ef9 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -392,9 +392,9 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_ctxt *dlm = data; struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; struct dlm_lock_resource *res = NULL; - struct dlm_lock *lock = NULL; + struct dlm_lock *lock = NULL, *iter; enum dlm_status status = DLM_NORMAL; - int found = 0, i; + int i; struct dlm_lockstatus *lksb = NULL; int ignore; u32 flags; @@ -437,7 +437,6 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } queue=&res->granted; - found = 0; spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { spin_unlock(&res->spinlock); @@ -461,21 +460,21 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } for (i=0; i<3; i++) { - list_for_each_entry(lock, queue, list) { - if (lock->ml.cookie == unlock->cookie && - lock->ml.node == unlock->node_idx) { - dlm_lock_get(lock); - found = 1; + list_for_each_entry(iter, queue, list) { + if (iter->ml.cookie == unlock->cookie && + iter->ml.node == unlock->node_idx) { + dlm_lock_get(iter); + lock = iter; break; } } - if (found) + if (lock) break; /* scan granted -> converting -> blocked queues */ queue++; } spin_unlock(&res->spinlock); - if (!found) { + if (!lock) { status = DLM_IVLOCKID; goto not_found; } @@ -505,7 +504,7 @@ int 
dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, dlm_kick_thread(dlm, res); not_found: - if (!found) + if (!lock) mlog(ML_ERROR, "failed to find lock to unlock! " "cookie=%u:%llu\n", dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)), diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index b1a8b046f4c2..5022b3e9bfcd 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -921,19 +921,19 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, { struct mem_dqinfo *info = sb_dqinfo(sb, type); struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; - struct ocfs2_quota_chunk *chunk; + struct ocfs2_quota_chunk *chunk = NULL, *iter; struct ocfs2_local_disk_chunk *dchunk; int found = 0, len; - list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { + list_for_each_entry(iter, &oinfo->dqi_chunk, qc_chunk) { dchunk = (struct ocfs2_local_disk_chunk *) - chunk->qc_headerbh->b_data; + iter->qc_headerbh->b_data; if (le32_to_cpu(dchunk->dqc_free) > 0) { - found = 1; + chunk = iter; break; } } - if (!found) + if (!chunk) return NULL; if (chunk->qc_num < oinfo->dqi_chunks - 1) { From b02da32b613f989b73c88113db16ab47de11a3fd Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Fri, 29 Apr 2022 14:37:57 -0700 Subject: [PATCH 07/65] ocfs2: remove usage of list iterator variable after the loop body To move the list iterator variable into the list_for_each_entry_*() macro in the future it should be avoided to use the list iterator variable after the loop body. To *never* use the list iterator variable after the loop it was concluded to use a separate iterator variable [1]. 
Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220322105014.3626194-1-jakobkoschel@gmail.com Signed-off-by: Jakob Koschel Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmdebug.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index d442cf5dda8a..be5e9ed7da8d 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -541,7 +541,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) struct debug_lockres *dl = m->private; struct dlm_ctxt *dlm = dl->dl_ctxt; struct dlm_lock_resource *oldres = dl->dl_res; - struct dlm_lock_resource *res = NULL; + struct dlm_lock_resource *res = NULL, *iter; struct list_head *track_list; spin_lock(&dlm->track_lock); @@ -556,11 +556,11 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) } } - list_for_each_entry(res, track_list, tracking) { - if (&res->tracking == &dlm->tracking_list) - res = NULL; - else - dlm_lockres_get(res); + list_for_each_entry(iter, track_list, tracking) { + if (&iter->tracking != &dlm->tracking_list) { + dlm_lockres_get(iter); + res = iter; + } break; } spin_unlock(&dlm->track_lock); From bb20b31dee1a6c329c2f721fbe21c51945cdfc29 Mon Sep 17 00:00:00 2001 From: Heming Zhao via Ocfs2-devel Date: Fri, 29 Apr 2022 14:37:58 -0700 Subject: [PATCH 08/65] ocfs2: fix mounting crash if journal is not alloced Patch series "rewrite error handling during mounting stage". This patch (of 5): After commit da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown"), journal init later than before, it makes NULL pointer access in free routine. Crash flow: ocfs2_fill_super + ocfs2_mount_volume | + ocfs2_dlm_init //fail & return, osb->journal is NULL. | + ... 
| + ocfs2_check_volume //no chance to init osb->journal | + ... + ocfs2_dismount_volume ocfs2_release_system_inodes ... evict ... ocfs2_clear_inode ocfs2_checkpoint_inode ocfs2_ci_fully_checkpointed time_after(journal->j_trans_id, ci->ci_last_trans) + journal is NULL, crash! To fix this, there are three possible solutions: 1> Partly revert commit da5e7c87827e8 To avoid the kernel crash, this makes sense for us. We are only concerned with whether there is any non-system inode access before dlm init. The answer is NO. And all journal replay/recovery handling happens after dlm & journal init are done. So this method is not graceful but workable. 2> Add an osb->journal check in the inode free routine (e.g. ocfs2_clear_inode) The fix code is specific to the mounting phase, but it would keep running after the mounting stage. In other words, this method adds useless code to the normal inode free flow. 3> Free the inode directly in the mounting phase This method is brutal/complex and may introduce unsafe code, and currently the maintainer does not like it. In the end, we chose method <1> and did a partial revert. We reverted the journal init code, and kept the cleanup code flow.
Link: https://lkml.kernel.org/r/20220424130952.2436-1-heming.zhao@suse.com Link: https://lkml.kernel.org/r/20220424130952.2436-2-heming.zhao@suse.com Fixes: da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown") Signed-off-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/inode.c | 4 ++-- fs/ocfs2/journal.c | 33 +++++++++++++++++++++++---------- fs/ocfs2/journal.h | 2 ++ fs/ocfs2/super.c | 15 +++++++++++++++ 4 files changed, 42 insertions(+), 12 deletions(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 5739dc301569..bb116c39b581 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -125,6 +125,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; + journal_t *journal = osb->journal->j_journal; trace_ocfs2_iget_begin((unsigned long long)blkno, flags, sysfile_type); @@ -171,11 +172,10 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ - if (osb->journal) { + if (journal) { transaction_t *transaction; tid_t tid; struct ocfs2_inode_info *oi = OCFS2_I(inode); - journal_t *journal = osb->journal->j_journal; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 1887a2708709..fa87d89cf754 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -810,22 +810,20 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) write_unlock(&journal->j_state_lock); } -int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) +/* + * alloc & initialize skeleton for journal structure. + * ocfs2_journal_init() will make fs have journal ability. 
+ */ +int ocfs2_journal_alloc(struct ocfs2_super *osb) { - int status = -1; - struct inode *inode = NULL; /* the journal inode */ - journal_t *j_journal = NULL; - struct ocfs2_journal *journal = NULL; - struct ocfs2_dinode *di = NULL; - struct buffer_head *bh = NULL; - int inode_lock = 0; + int status = 0; + struct ocfs2_journal *journal; - /* initialize our journal structure */ journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); if (!journal) { mlog(ML_ERROR, "unable to alloc journal\n"); status = -ENOMEM; - goto done; + goto bail; } osb->journal = journal; journal->j_osb = osb; @@ -839,6 +837,21 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); journal->j_state = OCFS2_JOURNAL_FREE; +bail: + return status; +} + +int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) +{ + int status = -1; + struct inode *inode = NULL; /* the journal inode */ + journal_t *j_journal = NULL; + struct ocfs2_journal *journal = osb->journal; + struct ocfs2_dinode *di = NULL; + struct buffer_head *bh = NULL; + int inode_lock = 0; + + BUG_ON(!journal); /* already have the inode for our journal */ inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, osb->slot_num); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 8dcb2f2cadbc..969d0aa28718 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -154,6 +154,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb); * Journal Control: * Initialize, Load, Shutdown, Wipe a journal. * + * ocfs2_journal_alloc - Initialize skeleton for journal structure. * ocfs2_journal_init - Initialize journal structures in the OSB. * ocfs2_journal_load - Load the given journal off disk. Replay it if * there's transactions still in there. @@ -167,6 +168,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb); * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. 
*/ void ocfs2_set_journal_params(struct ocfs2_super *osb); +int ocfs2_journal_alloc(struct ocfs2_super *osb); int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty); void ocfs2_journal_shutdown(struct ocfs2_super *osb); int ocfs2_journal_wipe(struct ocfs2_journal *journal, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 477cdf94122e..311433c69a3f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2195,6 +2195,15 @@ static int ocfs2_initialize_super(struct super_block *sb, get_random_bytes(&osb->s_next_generation, sizeof(u32)); + /* + * FIXME + * This should be done in ocfs2_journal_init(), but any inode + * writes back operation will cause the filesystem to crash. + */ + status = ocfs2_journal_alloc(osb); + if (status < 0) + goto bail; + INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); init_llist_head(&osb->dquot_drop_list); @@ -2483,6 +2492,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) kfree(osb->osb_orphan_wipes); kfree(osb->slot_recovery_generations); + /* FIXME + * This belongs in journal shutdown, but because we have to + * allocate osb->journal at the middle of ocfs2_initialize_super(), + * we free it here. + */ + kfree(osb->journal); kfree(osb->local_alloc_copy); kfree(osb->uuid_str); kfree(osb->vol_label); From 54bd3f7c5c3b6b6101673ec9c73457127c317bf9 Mon Sep 17 00:00:00 2001 From: Heming Zhao via Ocfs2-devel Date: Fri, 29 Apr 2022 14:37:58 -0700 Subject: [PATCH 09/65] ocfs2: change return type of ocfs2_resmap_init Since ocfs2_resmap_init() always return 0, change it to void. 
Link: https://lkml.kernel.org/r/20220424130952.2436-3-heming.zhao@suse.com Signed-off-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/reservations.c | 4 +--- fs/ocfs2/reservations.h | 9 ++------- fs/ocfs2/super.c | 6 +----- 3 files changed, 4 insertions(+), 15 deletions(-) diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 769e466887b0..a9d1296d736d 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -198,7 +198,7 @@ void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, resv->r_flags |= flags; } -int ocfs2_resmap_init(struct ocfs2_super *osb, +void ocfs2_resmap_init(struct ocfs2_super *osb, struct ocfs2_reservation_map *resmap) { memset(resmap, 0, sizeof(*resmap)); @@ -207,8 +207,6 @@ int ocfs2_resmap_init(struct ocfs2_super *osb, resmap->m_reservations = RB_ROOT; /* m_bitmap_len is initialized to zero by the above memset. */ INIT_LIST_HEAD(&resmap->m_lru); - - return 0; } static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap, diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h index 677c50663595..ec8101ef5717 100644 --- a/fs/ocfs2/reservations.h +++ b/fs/ocfs2/reservations.h @@ -73,15 +73,10 @@ void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, /** * ocfs2_resmap_init() - Initialize fields of a reservations bitmap + * @osb: struct ocfs2_super to be saved in resmap * @resmap: struct ocfs2_reservation_map to initialize - * @obj: unused for now - * @ops: unused for now - * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize) - * - * Only possible return value other than '0' is -ENOMEM for failure to - * allocation mirror bitmap. 
*/ -int ocfs2_resmap_init(struct ocfs2_super *osb, +void ocfs2_resmap_init(struct ocfs2_super *osb, struct ocfs2_reservation_map *resmap); /** diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 311433c69a3f..8014c690ef72 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2110,11 +2110,7 @@ static int ocfs2_initialize_super(struct super_block *sb, init_waitqueue_head(&osb->osb_mount_event); - status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); - if (status) { - mlog_errno(status); - goto bail; - } + ocfs2_resmap_init(osb, &osb->osb_la_resmap); osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); if (!osb->vol_label) { From a8a986db64930b7d4cd4e4f68d8718bfa75c9528 Mon Sep 17 00:00:00 2001 From: Heming Zhao via Ocfs2-devel Date: Fri, 29 Apr 2022 14:37:58 -0700 Subject: [PATCH 10/65] ocfs2: ocfs2_initialize_super does cleanup job before return error After this patch, when an error occurs, ocfs2_fill_super no longer has to take care of releasing the resources that were allocated in ocfs2_initialize_super.
Link: https://lkml.kernel.org/r/20220424130952.2436-4-heming.zhao@suse.com Signed-off-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/super.c | 59 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8014c690ef72..758ea3313f88 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2022,7 +2022,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out; } sb->s_fs_info = osb; @@ -2083,7 +2083,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "Invalid number of node slots (%u)\n", osb->max_slots); status = -EINVAL; - goto bail; + goto out; } ocfs2_orphan_scan_init(osb); @@ -2092,7 +2092,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (status) { mlog(ML_ERROR, "Unable to initialize recovery state\n"); mlog_errno(status); - goto bail; + goto out; } init_waitqueue_head(&osb->checkpoint_event); @@ -2116,7 +2116,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->vol_label) { mlog(ML_ERROR, "unable to alloc vol label\n"); status = -ENOMEM; - goto bail; + goto out_recovery_map; } osb->slot_recovery_generations = @@ -2125,7 +2125,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->slot_recovery_generations) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_vol_label; } init_waitqueue_head(&osb->osb_wipe_event); @@ -2135,7 +2135,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->osb_orphan_wipes) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_slot_recovery_gen; } osb->osb_rf_lock_tree = RB_ROOT; @@ -2151,13 +2151,13 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "couldn't mount because of unsupported " "optional 
features (%x).\n", i); status = -EINVAL; - goto bail; + goto out_orphan_wipes; } if (!sb_rdonly(osb->sb) && (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { mlog(ML_ERROR, "couldn't mount RDWR because of " "unsupported optional features (%x).\n", i); status = -EINVAL; - goto bail; + goto out_orphan_wipes; } if (ocfs2_clusterinfo_valid(osb)) { @@ -2178,7 +2178,7 @@ static int ocfs2_initialize_super(struct super_block *sb, "cluster stack label (%s) \n", osb->osb_cluster_stack); status = -EINVAL; - goto bail; + goto out_orphan_wipes; } memcpy(osb->osb_cluster_name, OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, @@ -2198,7 +2198,7 @@ static int ocfs2_initialize_super(struct super_block *sb, */ status = ocfs2_journal_alloc(osb); if (status < 0) - goto bail; + goto out_orphan_wipes; INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); init_llist_head(&osb->dquot_drop_list); @@ -2213,7 +2213,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", osb->s_clustersize); status = -EINVAL; - goto bail; + goto out_journal; } total_blocks = ocfs2_clusters_to_blocks(osb->sb, @@ -2225,14 +2225,14 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "Volume too large " "to mount safely on this system"); status = -EFBIG; - goto bail; + goto out_journal; } if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, sizeof(di->id2.i_super.s_uuid))) { mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); status = -ENOMEM; - goto bail; + goto out_journal; } strlcpy(osb->vol_label, di->id2.i_super.s_label, @@ -2252,7 +2252,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->osb_dlm_debug) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_uuid_str; } atomic_set(&osb->vol_state, VOLUME_INIT); @@ -2261,7 +2261,7 @@ static int ocfs2_initialize_super(struct super_block *sb, status = ocfs2_init_global_system_inodes(osb); if (status < 0) { 
mlog_errno(status); - goto bail; + goto out_dlm_out; } /* @@ -2272,7 +2272,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!inode) { status = -EINVAL; mlog_errno(status); - goto bail; + goto out_system_inodes; } osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; @@ -2285,16 +2285,39 @@ static int ocfs2_initialize_super(struct super_block *sb, status = ocfs2_init_slot_info(osb); if (status < 0) { mlog_errno(status); - goto bail; + goto out_system_inodes; } osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM); if (!osb->ocfs2_wq) { status = -ENOMEM; mlog_errno(status); + goto out_slot_info; } -bail: + return status; + +out_slot_info: + ocfs2_free_slot_info(osb); +out_system_inodes: + ocfs2_release_system_inodes(osb); +out_dlm_out: + ocfs2_put_dlm_debug(osb->osb_dlm_debug); +out_uuid_str: + kfree(osb->uuid_str); +out_journal: + kfree(osb->journal); +out_orphan_wipes: + kfree(osb->osb_orphan_wipes); +out_slot_recovery_gen: + kfree(osb->slot_recovery_generations); +out_vol_label: + kfree(osb->vol_label); +out_recovery_map: + kfree(osb->recovery_map); +out: + kfree(osb); + sb->s_fs_info = NULL; return status; } From 0737e01de9c411e4db87dcedf4a9789d41b1c5c1 Mon Sep 17 00:00:00 2001 From: Heming Zhao via Ocfs2-devel Date: Fri, 29 Apr 2022 14:37:58 -0700 Subject: [PATCH 11/65] ocfs2: ocfs2_mount_volume does cleanup job before return error After this patch, when error, ocfs2_fill_super doesn't take care to release resources which are allocated in ocfs2_mount_volume. 
Link: https://lkml.kernel.org/r/20220424130952.2436-5-heming.zhao@suse.com Signed-off-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/super.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 758ea3313f88..1cf18ed8cf1b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1803,11 +1803,10 @@ static int ocfs2_get_sector(struct super_block *sb, static int ocfs2_mount_volume(struct super_block *sb) { int status = 0; - int unlock_super = 0; struct ocfs2_super *osb = OCFS2_SB(sb); if (ocfs2_is_hard_readonly(osb)) - goto leave; + goto out; mutex_init(&osb->obs_trim_fs_mutex); @@ -1817,44 +1816,56 @@ static int ocfs2_mount_volume(struct super_block *sb) if (status == -EBADR && ocfs2_userspace_stack(osb)) mlog(ML_ERROR, "couldn't mount because cluster name on" " disk does not match the running cluster name.\n"); - goto leave; + goto out; } status = ocfs2_super_lock(osb, 1); if (status < 0) { mlog_errno(status); - goto leave; + goto out_dlm; } - unlock_super = 1; /* This will load up the node map and add ourselves to it. 
*/ status = ocfs2_find_slot(osb); if (status < 0) { mlog_errno(status); - goto leave; + goto out_super_lock; } /* load all node-local system inodes */ status = ocfs2_init_local_system_inodes(osb); if (status < 0) { mlog_errno(status); - goto leave; + goto out_super_lock; } status = ocfs2_check_volume(osb); if (status < 0) { mlog_errno(status); - goto leave; + goto out_system_inodes; } status = ocfs2_truncate_log_init(osb); - if (status < 0) + if (status < 0) { mlog_errno(status); + goto out_system_inodes; + } -leave: - if (unlock_super) - ocfs2_super_unlock(osb, 1); + ocfs2_super_unlock(osb, 1); + return 0; +out_system_inodes: + if (osb->local_alloc_state == OCFS2_LA_ENABLED) + ocfs2_shutdown_local_alloc(osb); + ocfs2_release_system_inodes(osb); + /* before journal shutdown, we should release slot_info */ + ocfs2_free_slot_info(osb); + ocfs2_journal_shutdown(osb); +out_super_lock: + ocfs2_super_unlock(osb, 1); +out_dlm: + ocfs2_dlm_shutdown(osb, 0); +out: return status; } From f1e75d128b46e3b066e7b2e7cfca10491109d44d Mon Sep 17 00:00:00 2001 From: Heming Zhao via Ocfs2-devel Date: Fri, 29 Apr 2022 14:37:58 -0700 Subject: [PATCH 12/65] ocfs2: rewrite error handling of ocfs2_fill_super Currently, ocfs2_fill_super() uses one goto label "read_super_error" to handle all error cases. With the previous patches in this series, the error handling should fork more branches to handle different error cases. This patch rewrites the error handling of ocfs2_fill_super. 
Link: https://lkml.kernel.org/r/20220424130952.2436-6-heming.zhao@suse.com Signed-off-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/super.c | 67 +++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 1cf18ed8cf1b..f7298816d8d9 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -989,28 +989,27 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { status = -EINVAL; - goto read_super_error; + goto out; } /* probe for superblock */ status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats); if (status < 0) { mlog(ML_ERROR, "superblock probe failed!\n"); - goto read_super_error; + goto out; } status = ocfs2_initialize_super(sb, bh, sector_size, &stats); - osb = OCFS2_SB(sb); - if (status < 0) { - mlog_errno(status); - goto read_super_error; - } brelse(bh); bh = NULL; + if (status < 0) + goto out; + + osb = OCFS2_SB(sb); if (!ocfs2_check_set_options(sb, &parsed_options)) { status = -EINVAL; - goto read_super_error; + goto out_super; } osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; @@ -1027,7 +1026,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) - goto read_super_error; + goto out_super; sb->s_magic = OCFS2_SUPER_MAGIC; @@ -1041,7 +1040,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = -EACCES; mlog(ML_ERROR, "Readonly device detected but readonly " "mount was not specified.\n"); - goto read_super_error; + goto out_super; } /* You should not be able to start a local heartbeat @@ -1050,7 +1049,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = 
-EROFS; mlog(ML_ERROR, "Local heartbeat specified on readonly " "device.\n"); - goto read_super_error; + goto out_super; } status = ocfs2_check_journals_nolocks(osb); @@ -1059,9 +1058,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) mlog(ML_ERROR, "Recovery required on readonly " "file system, but write access is " "unavailable.\n"); - else - mlog_errno(status); - goto read_super_error; + goto out_super; } ocfs2_set_ro_flag(osb, 1); @@ -1077,10 +1074,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) } status = ocfs2_verify_heartbeat(osb); - if (status < 0) { - mlog_errno(status); - goto read_super_error; - } + if (status < 0) + goto out_super; osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); @@ -1094,15 +1089,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = ocfs2_mount_volume(sb); if (status < 0) - goto read_super_error; + goto out_debugfs; if (osb->root_inode) inode = igrab(osb->root_inode); if (!inode) { status = -EIO; - mlog_errno(status); - goto read_super_error; + goto out_dismount; } osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL, @@ -1110,7 +1104,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) if (!osb->osb_dev_kset) { status = -ENOMEM; mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id); - goto read_super_error; + goto out_dismount; } /* Create filecheck sysfs related directories/files at @@ -1119,14 +1113,13 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = -ENOMEM; mlog(ML_ERROR, "Unable to create filecheck sysfs directory at " "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id); - goto read_super_error; + goto out_dismount; } root = d_make_root(inode); if (!root) { status = -ENOMEM; - mlog_errno(status); - goto read_super_error; + goto out_dismount; } sb->s_root = root; @@ -1178,17 +1171,21 @@ static int ocfs2_fill_super(struct super_block *sb, 
void *data, int silent) return status; -read_super_error: - brelse(bh); +out_dismount: + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + ocfs2_dismount_volume(sb, 1); + goto out; - if (status) - mlog_errno(status); - - if (osb) { - atomic_set(&osb->vol_state, VOLUME_DISABLED); - wake_up(&osb->osb_mount_event); - ocfs2_dismount_volume(sb, 1); - } +out_debugfs: + debugfs_remove_recursive(osb->osb_debug_root); +out_super: + ocfs2_release_system_inodes(osb); + kfree(osb->recovery_map); + ocfs2_delete_osb(osb); + kfree(osb); +out: + mlog_errno(status); return status; } From 04d168c6d42d1772d35372301a14bb20784c81c5 Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Fri, 29 Apr 2022 14:37:59 -0700 Subject: [PATCH 13/65] fs/proc/kcore.c: remove check of list iterator against head past the loop body When list_for_each_entry() completes the iteration over the whole list without breaking the loop, the iterator value will be a bogus pointer computed based on the head element. While it is safe to use the pointer to determine if it was computed based on the head element, either with list_entry_is_head() or &pos->member == head, using the iterator variable after the loop should be avoided. In preparation to limit the scope of a list iterator to the list traversal loop, use a dedicated pointer to point to the found element [1]. [akpm@linux-foundation.org: reduce scope of `iter'] Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ [1] Link: https://lkml.kernel.org/r/20220331223700.902556-1-jakobkoschel@gmail.com Signed-off-by: Jakob Koschel Cc: Mike Rapoport Cc: David Hildenbrand Cc: Oscar Salvador Cc: "Brian Johannesmeyer" Cc: Cristiano Giuffrida Cc: "Bos, H.J." 
Cc: Alexey Dobriyan Signed-off-by: Andrew Morton --- fs/proc/kcore.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 982e694aae77..dff921f7ca33 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -479,10 +479,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) * the previous entry, search for a matching entry. */ if (!m || start < m->addr || start >= m->addr + m->size) { - list_for_each_entry(m, &kclist_head, list) { - if (start >= m->addr && - start < m->addr + m->size) + struct kcore_list *iter; + + m = NULL; + list_for_each_entry(iter, &kclist_head, list) { + if (start >= iter->addr && + start < iter->addr + iter->size) { + m = iter; break; + } } } @@ -492,12 +497,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) page_offline_freeze(); } - if (&m->list == &kclist_head) { + if (!m) { if (clear_user(buffer, tsz)) { ret = -EFAULT; goto out; } - m = NULL; /* skip the list anchor */ goto skip; } From 5d8de293c224896a4da99763fce4f9794308caf4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 14:37:59 -0700 Subject: [PATCH 14/65] vmcore: convert copy_oldmem_page() to take an iov_iter Patch series "Convert vmcore to use an iov_iter", v5. For some reason several people have been sending bad patches to fix compiler warnings in vmcore recently. Here's how it should be done. Compile-tested only on x86. As noted in the first patch, s390 should take this conversion a bit further, but I'm not inclined to do that work myself. This patch (of 3): Instead of passing in a 'buf' and 'userbuf' argument, pass in an iov_iter. s390 needs more work to pass the iov_iter down further, or refactor, but I'd be more comfortable if someone who can test on s390 did that work. 
It's more convenient to convert the whole of read_from_oldmem() to take an iov_iter at the same time, so rename it to read_from_oldmem_iter() and add a temporary read_from_oldmem() wrapper that creates an iov_iter. Link: https://lkml.kernel.org/r/20220408090636.560886-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20220408090636.560886-2-bhe@redhat.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Cc: Heiko Carstens Signed-off-by: Andrew Morton --- arch/arm/kernel/crash_dump.c | 27 +++------------- arch/arm64/kernel/crash_dump.c | 29 +++-------------- arch/ia64/kernel/crash_dump.c | 32 +++---------------- arch/mips/kernel/crash_dump.c | 27 +++------------- arch/powerpc/kernel/crash_dump.c | 35 +++------------------ arch/riscv/kernel/crash_dump.c | 26 +++------------ arch/s390/kernel/crash_dump.c | 13 +++++--- arch/sh/kernel/crash_dump.c | 29 +++-------------- arch/x86/kernel/crash_dump_32.c | 29 +++-------------- arch/x86/kernel/crash_dump_64.c | 41 +++++++----------------- fs/proc/vmcore.c | 54 ++++++++++++++++++++------------ include/linux/crash_dump.h | 9 +++--- 12 files changed, 91 insertions(+), 260 deletions(-) diff --git a/arch/arm/kernel/crash_dump.c b/arch/arm/kernel/crash_dump.c index 53cb92435392..938bd932df9a 100644 --- a/arch/arm/kernel/crash_dump.c +++ b/arch/arm/kernel/crash_dump.c @@ -14,22 +14,10 @@ #include #include #include +#include -/** - * copy_oldmem_page() - copy one page from old kernel memory - * @pfn: page frame number to be copied - * @buf: buffer where the copied page is placed - * @csize: number of bytes to copy - * @offset: offset in bytes into the page - * @userbuf: if set, @buf is int he user address space - * - * This function copies one page from old kernel memory into buffer pointed by - * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes - * copied or negative error in case of failure. 
- */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, - int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; @@ -40,14 +28,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!vaddr) return -ENOMEM; - if (userbuf) { - if (copy_to_user(buf, vaddr + offset, csize)) { - iounmap(vaddr); - return -EFAULT; - } - } else { - memcpy(buf, vaddr + offset, csize); - } + csize = copy_to_iter(vaddr + offset, csize, iter); iounmap(vaddr); return csize; diff --git a/arch/arm64/kernel/crash_dump.c b/arch/arm64/kernel/crash_dump.c index 58303a9ec32c..670e4ce81822 100644 --- a/arch/arm64/kernel/crash_dump.c +++ b/arch/arm64/kernel/crash_dump.c @@ -9,25 +9,11 @@ #include #include #include -#include -#include +#include #include -/** - * copy_oldmem_page() - copy one page from old kernel memory - * @pfn: page frame number to be copied - * @buf: buffer where the copied page is placed - * @csize: number of bytes to copy - * @offset: offset in bytes into the page - * @userbuf: if set, @buf is in a user address space - * - * This function copies one page from old kernel memory into buffer pointed by - * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes - * copied or negative error in case of failure. 
- */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, - int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; @@ -38,14 +24,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!vaddr) return -ENOMEM; - if (userbuf) { - if (copy_to_user((char __user *)buf, vaddr + offset, csize)) { - memunmap(vaddr); - return -EFAULT; - } - } else { - memcpy(buf, vaddr + offset, csize); - } + csize = copy_to_iter(vaddr + offset, csize, iter); memunmap(vaddr); diff --git a/arch/ia64/kernel/crash_dump.c b/arch/ia64/kernel/crash_dump.c index 0ed3c3dee4cd..4ef68e2aa757 100644 --- a/arch/ia64/kernel/crash_dump.c +++ b/arch/ia64/kernel/crash_dump.c @@ -10,42 +10,18 @@ #include #include #include - +#include #include -#include -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - * - * Calling copy_to_user() in atomic context is not desirable. Hence first - * copying the data to a pre-allocated kernel page and then copying to user - * space in non-atomic context. 
- */ -ssize_t -copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; if (!csize) return 0; vaddr = __va(pfn< #include +#include -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; @@ -24,14 +12,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, return 0; vaddr = kmap_local_pfn(pfn); - - if (!userbuf) { - memcpy(buf, vaddr + offset, csize); - } else { - if (copy_to_user(buf, vaddr + offset, csize)) - csize = -EFAULT; - } - + csize = copy_to_iter(vaddr + offset, csize, iter); kunmap_local(vaddr); return csize; diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c index 5693e1c67c2b..32b4a97f1b79 100644 --- a/arch/powerpc/kernel/crash_dump.c +++ b/arch/powerpc/kernel/crash_dump.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include @@ -68,33 +68,8 @@ void __init setup_kdump_trampoline(void) } #endif /* CONFIG_NONSTATIC_KERNEL */ -static size_t copy_oldmem_vaddr(void *vaddr, char *buf, size_t csize, - unsigned long offset, int userbuf) -{ - if (userbuf) { - if (copy_to_user((char __user *)buf, (vaddr + 
offset), csize)) - return -EFAULT; - } else - memcpy(buf, (vaddr + offset), csize); - - return csize; -} - -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; phys_addr_t paddr; @@ -107,10 +82,10 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (memblock_is_region_memory(paddr, csize)) { vaddr = __va(paddr); - csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); + csize = copy_to_iter(vaddr + offset, csize, iter); } else { vaddr = ioremap_cache(paddr, PAGE_SIZE); - csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); + csize = copy_to_iter(vaddr + offset, csize, iter); iounmap(vaddr); } diff --git a/arch/riscv/kernel/crash_dump.c b/arch/riscv/kernel/crash_dump.c index 86cc0ada5752..ea2158cee97b 100644 --- a/arch/riscv/kernel/crash_dump.c +++ b/arch/riscv/kernel/crash_dump.c @@ -7,22 +7,10 @@ #include #include +#include -/** - * copy_oldmem_page() - copy one page from old kernel memory - * @pfn: page frame number to be copied - * @buf: buffer where the copied page is placed - * @csize: number of bytes to copy - * @offset: offset in bytes into the page - * @userbuf: if set, @buf is in a user address space - * - * This function copies one page from old kernel 
memory into buffer pointed by - * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes - * copied or negative error in case of failure. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, - int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; @@ -33,13 +21,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!vaddr) return -ENOMEM; - if (userbuf) { - if (copy_to_user((char __user *)buf, vaddr + offset, csize)) { - memunmap(vaddr); - return -EFAULT; - } - } else - memcpy(buf, vaddr + offset, csize); + csize = copy_to_iter(vaddr + offset, csize, iter); memunmap(vaddr); return csize; diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 69819b765250..a2c1c55daec0 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -212,8 +213,8 @@ static int copy_oldmem_user(void __user *dst, unsigned long src, size_t count) /* * Copy one page from "oldmem" */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { unsigned long src; int rc; @@ -221,10 +222,12 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, if (!csize) return 0; src = pfn_to_phys(pfn) + offset; - if (userbuf) - rc = copy_oldmem_user((void __force __user *) buf, src, csize); + + /* XXX: pass the iov_iter down to a common function */ + if (iter_is_iovec(iter)) + rc = copy_oldmem_user(iter->iov->iov_base, src, csize); else - rc = copy_oldmem_kernel((void *) buf, src, csize); + rc = copy_oldmem_kernel(iter->kvec->iov_base, src, csize); return rc; } diff --git a/arch/sh/kernel/crash_dump.c b/arch/sh/kernel/crash_dump.c index 
5b41b59698c1..19ce6a950aac 100644 --- a/arch/sh/kernel/crash_dump.c +++ b/arch/sh/kernel/crash_dump.c @@ -8,23 +8,11 @@ #include #include #include +#include #include -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void __iomem *vaddr; @@ -32,15 +20,8 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, return 0; vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); - - if (userbuf) { - if (copy_to_user((void __user *)buf, (vaddr + offset), csize)) { - iounmap(vaddr); - return -EFAULT; - } - } else - memcpy(buf, (vaddr + offset), csize); - + csize = copy_to_iter(vaddr + offset, csize, iter); iounmap(vaddr); + return csize; } diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 5fcac46aaf6b..5f4ae5476e19 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -10,8 +10,7 @@ #include #include #include - -#include +#include static inline bool is_crashed_pfn_valid(unsigned long pfn) { @@ -29,21 +28,8 @@ static inline bool is_crashed_pfn_valid(unsigned long pfn) #endif } -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * 
space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there might be no pte mapped - * in the current kernel. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { void *vaddr; @@ -54,14 +40,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, return -EFAULT; vaddr = kmap_local_pfn(pfn); - - if (!userbuf) { - memcpy(buf, vaddr + offset, csize); - } else { - if (copy_to_user(buf, vaddr + offset, csize)) - csize = -EFAULT; - } - + csize = copy_to_iter(vaddr + offset, csize, iter); kunmap_local(vaddr); return csize; diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 97529552dd24..94fe4aff9694 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -8,12 +8,12 @@ #include #include -#include +#include #include #include -static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf, +static ssize_t __copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset, bool encrypted) { void *vaddr; @@ -29,46 +29,27 @@ static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, if (!vaddr) return -ENOMEM; - if (userbuf) { - if (copy_to_user((void __user *)buf, vaddr + offset, csize)) { - iounmap((void __iomem *)vaddr); - return -EFAULT; - } - } else - memcpy(buf, vaddr + offset, csize); + csize = copy_to_iter(vaddr + offset, csize, iter); iounmap((void __iomem *)vaddr); return csize; } -/** - * copy_oldmem_page - copy one page of memory - * @pfn: 
page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from the old kernel's memory. For this page, there is no pte - * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { - return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false); + return __copy_oldmem_page(iter, pfn, csize, offset, false); } -/** +/* * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the * memory with the encryption mask set to accommodate kdump on SME-enabled * machines. */ -ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { - return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true); + return __copy_oldmem_page(iter, pfn, csize, offset, true); } ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 6f1b8ddc6f7a..54dda2e19ed1 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -128,9 +129,8 @@ static int open_vmcore(struct inode *inode, struct file *file) } /* Reads a page from the oldmem device from given offset. 
*/ -ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf, - bool encrypted) +static ssize_t read_from_oldmem_iter(struct iov_iter *iter, size_t count, + u64 *ppos, bool encrypted) { unsigned long pfn, offset; size_t nr_bytes; @@ -152,29 +152,23 @@ ssize_t read_from_oldmem(char *buf, size_t count, /* If pfn is not ram, return zeros for sparse dump files */ if (!pfn_is_ram(pfn)) { - tmp = 0; - if (!userbuf) - memset(buf, 0, nr_bytes); - else if (clear_user(buf, nr_bytes)) - tmp = -EFAULT; + tmp = iov_iter_zero(nr_bytes, iter); } else { if (encrypted) - tmp = copy_oldmem_page_encrypted(pfn, buf, + tmp = copy_oldmem_page_encrypted(iter, pfn, nr_bytes, - offset, - userbuf); + offset); else - tmp = copy_oldmem_page(pfn, buf, nr_bytes, - offset, userbuf); + tmp = copy_oldmem_page(iter, pfn, nr_bytes, + offset); } - if (tmp < 0) { + if (tmp < nr_bytes) { srcu_read_unlock(&vmcore_cb_srcu, idx); - return tmp; + return -EFAULT; } *ppos += nr_bytes; count -= nr_bytes; - buf += nr_bytes; read += nr_bytes; ++pfn; offset = 0; @@ -184,6 +178,27 @@ ssize_t read_from_oldmem(char *buf, size_t count, return read; } +ssize_t read_from_oldmem(char *buf, size_t count, + u64 *ppos, int userbuf, + bool encrypted) +{ + struct iov_iter iter; + struct iovec iov; + struct kvec kvec; + + if (userbuf) { + iov.iov_base = (__force void __user *)buf; + iov.iov_len = count; + iov_iter_init(&iter, READ, &iov, 1, count); + } else { + kvec.iov_base = buf; + kvec.iov_len = count; + iov_iter_kvec(&iter, READ, &kvec, 1, count); + } + + return read_from_oldmem_iter(&iter, count, ppos, encrypted); +} + /* * Architectures may override this function to allocate ELF header in 2nd kernel */ @@ -228,11 +243,10 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, /* * Architectures which support memory encryption override this. 
*/ -ssize_t __weak -copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t __weak copy_oldmem_page_encrypted(struct iov_iter *iter, + unsigned long pfn, size_t csize, unsigned long offset) { - return copy_oldmem_page(pfn, buf, csize, offset, userbuf); + return copy_oldmem_page(iter, pfn, csize, offset); } /* diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 620821549b23..a1cf7d5c03c7 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -24,11 +24,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot); -extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, - unsigned long, int); -extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, - int userbuf); +ssize_t copy_oldmem_page(struct iov_iter *i, unsigned long pfn, size_t csize, + unsigned long offset); +ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset); void vmcore_cleanup(void); From 4a22fd20379ca897a6bfdb8372b4f9601e430332 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 14:37:59 -0700 Subject: [PATCH 15/65] vmcore: convert __read_vmcore to use an iov_iter This gets rid of copy_to() and lets us use proc_read_iter() instead of proc_read(). 
Link: https://lkml.kernel.org/r/20220408090636.560886-3-bhe@redhat.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- fs/proc/vmcore.c | 82 ++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 52 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 54dda2e19ed1..4a721865b5cd 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -249,22 +249,8 @@ ssize_t __weak copy_oldmem_page_encrypted(struct iov_iter *iter, return copy_oldmem_page(iter, pfn, csize, offset); } -/* - * Copy to either kernel or user space - */ -static int copy_to(void *target, void *src, size_t size, int userbuf) -{ - if (userbuf) { - if (copy_to_user((char __user *) target, src, size)) - return -EFAULT; - } else { - memcpy(target, src, size); - } - return 0; -} - #ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP -static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf) +static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size) { struct vmcoredd_node *dump; u64 offset = 0; @@ -277,14 +263,13 @@ static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf) if (start < offset + dump->size) { tsz = min(offset + (u64)dump->size - start, (u64)size); buf = dump->buf + start - offset; - if (copy_to(dst, buf, tsz, userbuf)) { + if (copy_to_iter(buf, tsz, iter) < tsz) { ret = -EFAULT; goto out_unlock; } size -= tsz; start += tsz; - dst += tsz; /* Leave now if buffer filled already */ if (!size) @@ -340,33 +325,28 @@ out_unlock: /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. 
*/ -static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, - int userbuf) +static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) { ssize_t acc = 0, tmp; size_t tsz; u64 start; struct vmcore *m = NULL; - if (buflen == 0 || *fpos >= vmcore_size) + if (!iov_iter_count(iter) || *fpos >= vmcore_size) return 0; - /* trim buflen to not go beyond EOF */ - if (buflen > vmcore_size - *fpos) - buflen = vmcore_size - *fpos; + iov_iter_truncate(iter, vmcore_size - *fpos); /* Read ELF core header */ if (*fpos < elfcorebuf_sz) { - tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen); - if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf)) + tsz = min(elfcorebuf_sz - (size_t)*fpos, iov_iter_count(iter)); + if (copy_to_iter(elfcorebuf + *fpos, tsz, iter) < tsz) return -EFAULT; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (buflen == 0) + if (!iov_iter_count(iter)) return acc; } @@ -387,35 +367,32 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, /* Read device dumps */ if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) { tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - - (size_t)*fpos, buflen); + (size_t)*fpos, iov_iter_count(iter)); start = *fpos - elfcorebuf_sz; - if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf)) + if (vmcoredd_copy_dumps(iter, start, tsz)) return -EFAULT; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (!buflen) + if (!iov_iter_count(iter)) return acc; } #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ /* Read remaining elf notes */ - tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, + iov_iter_count(iter)); kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz; - if (copy_to(buffer, kaddr, tsz, userbuf)) + if (copy_to_iter(kaddr, tsz, iter) < tsz) return -EFAULT; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* 
leave now if filled buffer already */ - if (buflen == 0) + if (!iov_iter_count(iter)) return acc; } @@ -423,19 +400,17 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, if (*fpos < m->offset + m->size) { tsz = (size_t)min_t(unsigned long long, m->offset + m->size - *fpos, - buflen); + iov_iter_count(iter)); start = m->paddr + *fpos - m->offset; - tmp = read_from_oldmem(buffer, tsz, &start, - userbuf, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); + tmp = read_from_oldmem_iter(iter, tsz, &start, + cc_platform_has(CC_ATTR_MEM_ENCRYPT)); if (tmp < 0) return tmp; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (buflen == 0) + if (!iov_iter_count(iter)) return acc; } } @@ -443,15 +418,14 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, return acc; } -static ssize_t read_vmcore(struct file *file, char __user *buffer, - size_t buflen, loff_t *fpos) +static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter) { - return __read_vmcore((__force char *) buffer, buflen, fpos, 1); + return __read_vmcore(iter, &iocb->ki_pos); } /* * The vmcore fault handler uses the page cache and fills data using the - * standard __vmcore_read() function. + * standard __read_vmcore() function. * * On s390 the fault handler is used for memory regions that can't be mapped * directly with remap_pfn_range(). 
@@ -461,9 +435,10 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) #ifdef CONFIG_S390 struct address_space *mapping = vmf->vma->vm_file->f_mapping; pgoff_t index = vmf->pgoff; + struct iov_iter iter; + struct kvec kvec; struct page *page; loff_t offset; - char *buf; int rc; page = find_or_create_page(mapping, index, GFP_KERNEL); @@ -471,8 +446,11 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) return VM_FAULT_OOM; if (!PageUptodate(page)) { offset = (loff_t) index << PAGE_SHIFT; - buf = __va((page_to_pfn(page) << PAGE_SHIFT)); - rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0); + kvec.iov_base = page_address(page); + kvec.iov_len = PAGE_SIZE; + iov_iter_kvec(&iter, READ, &kvec, 1, PAGE_SIZE); + + rc = __read_vmcore(&iter, &offset); if (rc < 0) { unlock_page(page); put_page(page); @@ -722,7 +700,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) static const struct proc_ops vmcore_proc_ops = { .proc_open = open_vmcore, - .proc_read = read_vmcore, + .proc_read_iter = read_vmcore, .proc_lseek = default_llseek, .proc_mmap = mmap_vmcore, }; From e0690479917cbce740eef51fa3de92c69647a5ad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 14:37:59 -0700 Subject: [PATCH 16/65] vmcore: convert read_from_oldmem() to take an iov_iter Remove the read_from_oldmem() wrapper introduced earlier and convert all the remaining callers to pass an iov_iter. 
Link: https://lkml.kernel.org/r/20220408090636.560886-4-bhe@redhat.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Cc: Tiezhu Yang Cc: Amit Daniel Kachhap Cc: Al Viro Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- arch/x86/kernel/crash_dump_64.c | 7 +++++- fs/proc/vmcore.c | 40 +++++++++++++-------------------- include/linux/crash_dump.h | 10 ++++----- 3 files changed, 25 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 94fe4aff9694..e75bc2f217ff 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -54,6 +54,11 @@ ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn, ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)); } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 4a721865b5cd..4eaeb645e759 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -129,7 +129,7 @@ static int open_vmcore(struct inode *inode, struct file *file) } /* Reads a page from the oldmem device from given offset. 
*/ -static ssize_t read_from_oldmem_iter(struct iov_iter *iter, size_t count, +ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, u64 *ppos, bool encrypted) { unsigned long pfn, offset; @@ -178,27 +178,6 @@ static ssize_t read_from_oldmem_iter(struct iov_iter *iter, size_t count, return read; } -ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf, - bool encrypted) -{ - struct iov_iter iter; - struct iovec iov; - struct kvec kvec; - - if (userbuf) { - iov.iov_base = (__force void __user *)buf; - iov.iov_len = count; - iov_iter_init(&iter, READ, &iov, 1, count); - } else { - kvec.iov_base = buf; - kvec.iov_len = count; - iov_iter_kvec(&iter, READ, &kvec, 1, count); - } - - return read_from_oldmem_iter(&iter, count, ppos, encrypted); -} - /* * Architectures may override this function to allocate ELF header in 2nd kernel */ @@ -218,7 +197,12 @@ void __weak elfcorehdr_free(unsigned long long addr) */ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, false); + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, false); } /* @@ -226,7 +210,13 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) */ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, + cc_platform_has(CC_ATTR_MEM_ENCRYPT)); } /* @@ -402,7 +392,7 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) m->offset + m->size - *fpos, iov_iter_count(iter)); start = m->paddr + *fpos - m->offset; - tmp = read_from_oldmem_iter(iter, tsz, &start, + tmp = read_from_oldmem(iter, 
tsz, &start, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); if (tmp < 0) return tmp; diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index a1cf7d5c03c7..0f3a656293b0 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -134,13 +134,11 @@ static inline int vmcore_add_device_dump(struct vmcoredd_data *data) #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ #ifdef CONFIG_PROC_VMCORE -ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf, - bool encrypted); +ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, + u64 *ppos, bool encrypted); #else -static inline ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf, - bool encrypted) +static inline ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, + u64 *ppos, bool encrypted) { return -EOPNOTSUPP; } From 6308499b5e99c0c903fde2c605e41d9a86c4be6c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Apr 2022 14:37:59 -0700 Subject: [PATCH 17/65] net: unexport csum_and_copy_{from,to}_user csum_and_copy_from_user and csum_and_copy_to_user are exported by a few architectures, but not actually used in modular code. Drop the exports. 
Link: https://lkml.kernel.org/r/20220421070440.1282704-1-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Jakub Kicinski Acked-by: Geert Uytterhoeven Acked-by: Arnd Bergmann Acked-by: Michael Ellerman (powerpc) Cc: David Miller Signed-off-by: Andrew Morton --- arch/alpha/lib/csum_partial_copy.c | 1 - arch/m68k/lib/checksum.c | 2 -- arch/powerpc/lib/checksum_wrappers.c | 2 -- arch/x86/lib/csum-wrappers_64.c | 2 -- 4 files changed, 7 deletions(-) diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c index 1931a04af85a..4d180d96f09e 100644 --- a/arch/alpha/lib/csum_partial_copy.c +++ b/arch/alpha/lib/csum_partial_copy.c @@ -353,7 +353,6 @@ csum_and_copy_from_user(const void __user *src, void *dst, int len) return 0; return __csum_and_copy(src, dst, len); } -EXPORT_SYMBOL(csum_and_copy_from_user); __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len) diff --git a/arch/m68k/lib/checksum.c b/arch/m68k/lib/checksum.c index 7e6afeae6217..5acb821849d3 100644 --- a/arch/m68k/lib/checksum.c +++ b/arch/m68k/lib/checksum.c @@ -265,8 +265,6 @@ csum_and_copy_from_user(const void __user *src, void *dst, int len) return sum; } -EXPORT_SYMBOL(csum_and_copy_from_user); - /* * copy from kernel space while checksumming, otherwise like csum_partial diff --git a/arch/powerpc/lib/checksum_wrappers.c b/arch/powerpc/lib/checksum_wrappers.c index f3999cbb2fcc..1a14c8780278 100644 --- a/arch/powerpc/lib/checksum_wrappers.c +++ b/arch/powerpc/lib/checksum_wrappers.c @@ -24,7 +24,6 @@ __wsum csum_and_copy_from_user(const void __user *src, void *dst, user_read_access_end(); return csum; } -EXPORT_SYMBOL(csum_and_copy_from_user); __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len) { @@ -38,4 +37,3 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len) user_write_access_end(); return csum; } -EXPORT_SYMBOL(csum_and_copy_to_user); diff --git a/arch/x86/lib/csum-wrappers_64.c 
b/arch/x86/lib/csum-wrappers_64.c index 189344924a2b..145f9a0bde29 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -32,7 +32,6 @@ csum_and_copy_from_user(const void __user *src, void *dst, int len) user_access_end(); return sum; } -EXPORT_SYMBOL(csum_and_copy_from_user); /** * csum_and_copy_to_user - Copy and checksum to user space. @@ -57,7 +56,6 @@ csum_and_copy_to_user(const void *src, void __user *dst, int len) user_access_end(); return sum; } -EXPORT_SYMBOL(csum_and_copy_to_user); /** * csum_partial_copy_nocheck - Copy and checksum. From c06d7aaf2951ce7f986a879127995728d63d8577 Mon Sep 17 00:00:00 2001 From: Haowen Bai Date: Fri, 29 Apr 2022 14:38:00 -0700 Subject: [PATCH 18/65] kernel: pid_namespace: use NULL instead of using plain integer as pointer This fixes the following sparse warnings: kernel/pid_namespace.c:55:77: warning: Using plain integer as NULL pointer Link: https://lkml.kernel.org/r/1647944288-2806-1-git-send-email-baihaowen@meizu.com Signed-off-by: Haowen Bai Signed-off-by: Andrew Morton --- kernel/pid_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a46a3723bc66..f4f8cb0435b4 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -52,7 +52,7 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) /* Name collision forces to do allocation under mutex. */ if (!*pkc) *pkc = kmem_cache_create(name, len, 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0); + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); mutex_unlock(&pid_caches_mutex); /* current can fail, but someone else can succeed. */ return READ_ONCE(*pkc); From 11fb48961e5250768767612da4a303fa2f5ea504 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 29 Apr 2022 14:38:00 -0700 Subject: [PATCH 19/65] get_maintainer: Honor mailmap for in file emails Add support to also use the mailmap for 'in file' email addresses. 
Link: https://lkml.kernel.org/r/20220323193645.317514-1-robh@kernel.org Signed-off-by: Rob Herring Reported-by: Marc Zyngier Acked-by: Joe Perches Signed-off-by: Andrew Morton --- scripts/get_maintainer.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 6bd5221d37b8..ab123b498fd9 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -983,6 +983,7 @@ sub get_maintainers { } foreach my $email (@file_emails) { + $email = mailmap_email($email); my ($name, $address) = parse_email($email); my $tmp_email = format_email($name, $address, $email_usename); From d4557fae77079f4e53f06712395c7a28e3734eb7 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Fri, 29 Apr 2022 14:38:00 -0700 Subject: [PATCH 20/65] lib/test_meminit: optimize do_kmem_cache_rcu_persistent() test To make the test more robust, there are the following changes: 1. add a check for the return value of kmem_cache_alloc(). 2. properly release the object `buf` on several error paths. 3. release the objects of `used_objects` if we never hit `saved_ptr`. 4. destroy the created cache by default. 
Link: https://lkml.kernel.org/r/tencent_7CB95F1C3914BCE1CA4A61FF7C20E7CCB108@qq.com Signed-off-by: Xiaoke Wang Reviewed-by: Andrew Morton Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Marco Elver Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Xiaoke Wang Signed-off-by: Andrew Morton --- lib/test_meminit.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/lib/test_meminit.c b/lib/test_meminit.c index 3ca717f11397..c95db11a6906 100644 --- a/lib/test_meminit.c +++ b/lib/test_meminit.c @@ -279,13 +279,18 @@ static int __init do_kmem_cache_rcu_persistent(int size, int *total_failures) c = kmem_cache_create("test_cache", size, size, SLAB_TYPESAFE_BY_RCU, NULL); buf = kmem_cache_alloc(c, GFP_KERNEL); + if (!buf) + goto out; saved_ptr = buf; fill_with_garbage(buf, size); buf_contents = kmalloc(size, GFP_KERNEL); - if (!buf_contents) + if (!buf_contents) { + kmem_cache_free(c, buf); goto out; + } used_objects = kmalloc_array(maxiter, sizeof(void *), GFP_KERNEL); if (!used_objects) { + kmem_cache_free(c, buf); kfree(buf_contents); goto out; } @@ -306,11 +311,14 @@ static int __init do_kmem_cache_rcu_persistent(int size, int *total_failures) } } + for (iter = 0; iter < maxiter; iter++) + kmem_cache_free(c, used_objects[iter]); + free_out: - kmem_cache_destroy(c); kfree(buf_contents); kfree(used_objects); out: + kmem_cache_destroy(c); *total_failures += fail; return 1; } From 67fca000e1e173fe2c539a127ccf1bc338d5ff37 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 29 Apr 2022 14:38:00 -0700 Subject: [PATCH 21/65] lib/Kconfig.debug: remove more CONFIG_..._VALUE indirections As in "kernel/panic.c: remove CONFIG_PANIC_ON_OOPS_VALUE indirection", use the IS_ENABLED() helper rather than having a hidden config option. 
Link: https://lkml.kernel.org/r/20220321121301.1389693-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Masahiro Yamada Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/hung_task.c | 2 +- kernel/watchdog.c | 4 ++-- lib/Kconfig.debug | 21 --------------------- 3 files changed, 3 insertions(+), 24 deletions(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 52501e5f7655..cff3ae8c818f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -73,7 +73,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; * hung task is detected: */ unsigned int __read_mostly sysctl_hung_task_panic = - CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; + IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC); static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9166220457bc..ecb0e8346e65 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -57,7 +57,7 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace; * Should we panic when a soft-lockup or hard-lockup occurs: */ unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; + IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -168,7 +168,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; + IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC); static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 075cd25363ac..8fa08100dbd8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1071,13 +1071,6 @@ config BOOTPARAM_SOFTLOCKUP_PANIC Say N if unsure. 
-config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE - int - depends on SOFTLOCKUP_DETECTOR - range 0 1 - default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC - default 1 if BOOTPARAM_SOFTLOCKUP_PANIC - config HARDLOCKUP_DETECTOR_PERF bool select SOFTLOCKUP_DETECTOR @@ -1119,13 +1112,6 @@ config BOOTPARAM_HARDLOCKUP_PANIC Say N if unsure. -config BOOTPARAM_HARDLOCKUP_PANIC_VALUE - int - depends on HARDLOCKUP_DETECTOR - range 0 1 - default 0 if !BOOTPARAM_HARDLOCKUP_PANIC - default 1 if BOOTPARAM_HARDLOCKUP_PANIC - config DETECT_HUNG_TASK bool "Detect Hung Tasks" depends on DEBUG_KERNEL @@ -1173,13 +1159,6 @@ config BOOTPARAM_HUNG_TASK_PANIC Say N if unsure. -config BOOTPARAM_HUNG_TASK_PANIC_VALUE - int - depends on DETECT_HUNG_TASK - range 0 1 - default 0 if !BOOTPARAM_HUNG_TASK_PANIC - default 1 if BOOTPARAM_HUNG_TASK_PANIC - config WQ_WATCHDOG bool "Detect Workqueue Stalls" depends on DEBUG_KERNEL From e0fa2ab3fcff42b8c2ed906f5619aae896e1e5e1 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 29 Apr 2022 14:38:00 -0700 Subject: [PATCH 22/65] lib/test_string.c: add strspn and strcspn tests Before refactoring strspn() and strcspn(), add some simple test cases. 
Link: https://lkml.kernel.org/r/20220328224119.3003834-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Andy Shevchenko Signed-off-by: Andrew Morton --- lib/test_string.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/lib/test_string.c b/lib/test_string.c index 9dfd6f52de92..c5cb92fb710e 100644 --- a/lib/test_string.c +++ b/lib/test_string.c @@ -179,6 +179,34 @@ static __init int strnchr_selftest(void) return 0; } +static __init int strspn_selftest(void) +{ + static const struct strspn_test { + const char str[16]; + const char accept[16]; + const char reject[16]; + unsigned a; + unsigned r; + } tests[] __initconst = { + { "foobar", "", "", 0, 6 }, + { "abba", "abc", "ABBA", 4, 4 }, + { "abba", "a", "b", 1, 1 }, + { "", "abc", "abc", 0, 0}, + }; + const struct strspn_test *s = tests; + size_t i, res; + + for (i = 0; i < ARRAY_SIZE(tests); ++i, ++s) { + res = strspn(s->str, s->accept); + if (res != s->a) + return 0x100 + 2*i; + res = strcspn(s->str, s->reject); + if (res != s->r) + return 0x100 + 2*i + 1; + } + return 0; +} + static __exit void string_selftest_remove(void) { } @@ -212,6 +240,11 @@ static __init int string_selftest_init(void) if (subtest) goto fail; + test = 6; + subtest = strspn_selftest(); + if (subtest) + goto fail; + pr_info("String selftests succeeded\n"); return 0; fail: From dffad91b06e0a1ee584f008565cbf2bb508a9777 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 29 Apr 2022 14:38:01 -0700 Subject: [PATCH 23/65] lib/string.c: simplify str[c]spn Use strchr(), which makes them a lot shorter, and more obviously symmetric in their treatment of accept/reject. It also saves a little bit of .text; bloat-o-meter for an arm build says Function old new delta strcspn 92 76 -16 strspn 108 76 -32 While here, also remove a stray empty line before EXPORT_SYMBOL(). 
Link: https://lkml.kernel.org/r/20220328224119.3003834-2-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Andy Shevchenko Signed-off-by: Andrew Morton --- lib/string.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/lib/string.c b/lib/string.c index 485777c9da83..6f334420f687 100644 --- a/lib/string.c +++ b/lib/string.c @@ -517,21 +517,13 @@ EXPORT_SYMBOL(strnlen); size_t strspn(const char *s, const char *accept) { const char *p; - const char *a; - size_t count = 0; for (p = s; *p != '\0'; ++p) { - for (a = accept; *a != '\0'; ++a) { - if (*p == *a) - break; - } - if (*a == '\0') - return count; - ++count; + if (!strchr(accept, *p)) + break; } - return count; + return p - s; } - EXPORT_SYMBOL(strspn); #endif @@ -544,17 +536,12 @@ EXPORT_SYMBOL(strspn); size_t strcspn(const char *s, const char *reject) { const char *p; - const char *r; - size_t count = 0; for (p = s; *p != '\0'; ++p) { - for (r = reject; *r != '\0'; ++r) { - if (*p == *r) - return count; - } - ++count; + if (strchr(reject, *p)) + break; } - return count; + return p - s; } EXPORT_SYMBOL(strcspn); #endif From d1bd5fa07667fcc3e38996ec42aef98761f23039 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 29 Apr 2022 14:38:01 -0700 Subject: [PATCH 24/65] lib: remove back_str initialization Clang static analysis reports this false positive glob.c:48:32: warning: Assigned value is garbage or undefined char const *back_pat = NULL, *back_str = back_str; ^~~~~~~~ ~~~~~~~~ back_str is set after back_pat and its use is protected by the !back_pat check. It is not necessary to initialize back_str, so remove the initialization.
Link: https://lkml.kernel.org/r/20220402131546.3383578-1-trix@redhat.com Signed-off-by: Tom Rix Reviewed-by: Nick Desaulniers Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- lib/glob.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/glob.c b/lib/glob.c index 85ecbda45cd8..15b73f490720 100644 --- a/lib/glob.c +++ b/lib/glob.c @@ -45,7 +45,7 @@ bool __pure glob_match(char const *pat, char const *str) * (no exception for /), it can be easily proved that there's * never a need to backtrack multiple levels. */ - char const *back_pat = NULL, *back_str = back_str; + char const *back_pat = NULL, *back_str; /* * Loop over each token (character or class) in pat, matching From f485922d8fe4e44f6d52a5bb95a603b7c65554bb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 29 Apr 2022 14:38:01 -0700 Subject: [PATCH 25/65] pipe: make poll_usage boolean and annotate its access Patch series "Fix data-races around epoll reported by KCSAN." This series suppresses a false-positive KCSAN message and fixes a real data-race. This patch (of 2): pipe_poll() runs locklessly and assigns 1 to poll_usage. Once poll_usage is set to 1, it never changes in other places. However, concurrent writes of a value trigger KCSAN, so let's make KCSAN happy.
BUG: KCSAN: data-race in pipe_poll / pipe_poll write to 0xffff8880042f6678 of 4 bytes by task 174 on cpu 3: pipe_poll (fs/pipe.c:656) ep_item_poll.isra.0 (./include/linux/poll.h:88 fs/eventpoll.c:853) do_epoll_wait (fs/eventpoll.c:1692 fs/eventpoll.c:1806 fs/eventpoll.c:2234) __x64_sys_epoll_wait (fs/eventpoll.c:2246 fs/eventpoll.c:2241 fs/eventpoll.c:2241) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113) write to 0xffff8880042f6678 of 4 bytes by task 177 on cpu 1: pipe_poll (fs/pipe.c:656) ep_item_poll.isra.0 (./include/linux/poll.h:88 fs/eventpoll.c:853) do_epoll_wait (fs/eventpoll.c:1692 fs/eventpoll.c:1806 fs/eventpoll.c:2234) __x64_sys_epoll_wait (fs/eventpoll.c:2246 fs/eventpoll.c:2241 fs/eventpoll.c:2241) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113) Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 177 Comm: epoll_race Not tainted 5.17.0-58927-gf443e374ae13 #6 Hardware name: Red Hat KVM, BIOS 1.11.0-2.amzn2 04/01/2014 Link: https://lkml.kernel.org/r/20220322002653.33865-1-kuniyu@amazon.co.jp Link: https://lkml.kernel.org/r/20220322002653.33865-2-kuniyu@amazon.co.jp Fixes: 3b844826b6c6 ("pipe: avoid unnecessary EPOLLET wakeups under normal loads") Signed-off-by: Kuniyuki Iwashima Cc: Alexander Duyck Cc: Al Viro Cc: Davidlohr Bueso Cc: Kuniyuki Iwashima Cc: "Soheil Hassas Yeganeh" Cc: "Sridhar Samudrala" Signed-off-by: Andrew Morton --- fs/pipe.c | 2 +- include/linux/pipe_fs_i.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index e140ea150bbb..d04c3fce28a6 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -653,7 +653,7 @@ pipe_poll(struct file *filp, poll_table *wait) unsigned int head, tail; /* Epoll has some historical nasty semantics, this enables them */ - pipe->poll_usage = 1; + WRITE_ONCE(pipe->poll_usage, true); /* * Reading pipe state only 
-- no need for acquiring the semaphore. diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index c00c618ef290..cb0fd633a610 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -71,7 +71,7 @@ struct pipe_inode_info { unsigned int files; unsigned int r_counter; unsigned int w_counter; - unsigned int poll_usage; + bool poll_usage; struct page *tmp_page; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; From d679ae94fdd5d3ab00c35078f5af5f37e068b03d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 29 Apr 2022 14:38:01 -0700 Subject: [PATCH 26/65] list: fix a data-race around ep->rdllist ep_poll() first calls ep_events_available() with no lock held and checks if ep->rdllist is empty by list_empty_careful(), which reads rdllist->prev. Thus all accesses to it need some protection to avoid store/load-tearing. Note INIT_LIST_HEAD_RCU() already has the annotation for both prev and next. Commit bf3b9f6372c4 ("epoll: Add busy poll support to epoll with socket fds.") added the first lockless ep_events_available(), and commit c5a282e9635e ("fs/epoll: reduce the scope of wq lock in epoll_wait()") made some ep_events_available() calls lockless and added single call under a lock, finally commit e59d3c64cba6 ("epoll: eliminate unnecessary lock for zero timeout") made the last ep_events_available() lockless. 
BUG: KCSAN: data-race in do_epoll_wait / do_epoll_wait write to 0xffff88810480c7d8 of 8 bytes by task 1802 on cpu 0: INIT_LIST_HEAD include/linux/list.h:38 [inline] list_splice_init include/linux/list.h:492 [inline] ep_start_scan fs/eventpoll.c:622 [inline] ep_send_events fs/eventpoll.c:1656 [inline] ep_poll fs/eventpoll.c:1806 [inline] do_epoll_wait+0x4eb/0xf40 fs/eventpoll.c:2234 do_epoll_pwait fs/eventpoll.c:2268 [inline] __do_sys_epoll_pwait fs/eventpoll.c:2281 [inline] __se_sys_epoll_pwait+0x12b/0x240 fs/eventpoll.c:2275 __x64_sys_epoll_pwait+0x74/0x80 fs/eventpoll.c:2275 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff88810480c7d8 of 8 bytes by task 1799 on cpu 1: list_empty_careful include/linux/list.h:329 [inline] ep_events_available fs/eventpoll.c:381 [inline] ep_poll fs/eventpoll.c:1797 [inline] do_epoll_wait+0x279/0xf40 fs/eventpoll.c:2234 do_epoll_pwait fs/eventpoll.c:2268 [inline] __do_sys_epoll_pwait fs/eventpoll.c:2281 [inline] __se_sys_epoll_pwait+0x12b/0x240 fs/eventpoll.c:2275 __x64_sys_epoll_pwait+0x74/0x80 fs/eventpoll.c:2275 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0xffff88810480c7d0 -> 0xffff888103c15098 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 1799 Comm: syz-fuzzer Tainted: G W 5.17.0-rc7-syzkaller-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Link: https://lkml.kernel.org/r/20220322002653.33865-3-kuniyu@amazon.co.jp Fixes: e59d3c64cba6 ("epoll: eliminate unnecessary lock for zero timeout") Fixes: c5a282e9635e ("fs/epoll: reduce the scope of wq lock in epoll_wait()") Fixes: bf3b9f6372c4 ("epoll: Add busy poll support to epoll with socket fds.") Signed-off-by: Kuniyuki Iwashima Reported-by: syzbot+bdd6e38a1ed5ee58d8bd@syzkaller.appspotmail.com Cc: Al 
Viro , Andrew Morton Cc: Kuniyuki Iwashima Cc: Kuniyuki Iwashima Cc: "Soheil Hassas Yeganeh" Cc: Davidlohr Bueso Cc: "Sridhar Samudrala" Cc: Alexander Duyck Signed-off-by: Andrew Morton --- include/linux/list.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index dd6c2041d09c..d7d2bfa1a365 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -35,7 +35,7 @@ static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); - list->prev = list; + WRITE_ONCE(list->prev, list); } #ifdef CONFIG_DEBUG_LIST @@ -306,7 +306,7 @@ static inline int list_empty(const struct list_head *head) static inline void list_del_init_careful(struct list_head *entry) { __list_del_entry(entry); - entry->prev = entry; + WRITE_ONCE(entry->prev, entry); smp_store_release(&entry->next, entry); } @@ -326,7 +326,7 @@ static inline void list_del_init_careful(struct list_head *entry) static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); - return list_is_head(next, head) && (next == head->prev); + return list_is_head(next, head) && (next == READ_ONCE(head->prev)); } /** From 7374fa33dc2dd76b71999f8fd236e73b21161030 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 29 Apr 2022 14:38:01 -0700 Subject: [PATCH 27/65] init/Kconfig: remove USELIB syscall by default The uselib syscall has been long deprecated. There's no need to keep this enabled by default under X86_32. Link: https://lkml.kernel.org/r/20220412212519.4113845-1-keescook@chromium.org Signed-off-by: Kees Cook Reviewed-by: Nathan Chancellor Cc: Masahiro Yamada Signed-off-by: Andrew Morton --- init/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index ddcbefe535e9..5cddb9ba0eef 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -435,8 +435,8 @@ config CROSS_MEMORY_ATTACH See the man page for more details. 
config USELIB - bool "uselib syscall" - def_bool ALPHA || M68K || SPARC || X86_32 || IA32_EMULATION + bool "uselib syscall (for libc5 and earlier)" + default ALPHA || M68K || SPARC help This option enables the uselib syscall, a system call used in the dynamic linker from libc5 and earlier. glibc does not use this From 3fbb6b784acb4f308e2bc93dbc57761e8b6d9e80 Mon Sep 17 00:00:00 2001 From: Yubo Feng Date: Fri, 29 Apr 2022 14:38:02 -0700 Subject: [PATCH 28/65] fatfs: remove redundant judgment iput() has already judged the incoming parameter, so there is no need to repeat outside. Link: https://lkml.kernel.org/r/1648265418-76563-1-git-send-email-fengyubo3@huawei.com Signed-off-by: Yubo Feng Reported-by: Hulk Robot Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton --- fs/fat/inode.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index bf6051bdf1d1..cb698a827c9a 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1889,10 +1889,8 @@ out_invalid: fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem"); out_fail: - if (fsinfo_inode) - iput(fsinfo_inode); - if (fat_inode) - iput(fat_inode); + iput(fsinfo_inode); + iput(fat_inode); unload_nls(sbi->nls_io); unload_nls(sbi->nls_disk); fat_reset_iocharset(&sbi->options); From e057aaec34ae7534ac8f5cc4f880aa7de8402852 Mon Sep 17 00:00:00 2001 From: Jonathan Lassoff Date: Fri, 29 Apr 2022 14:38:02 -0700 Subject: [PATCH 29/65] fatfs: add FAT messages to printk index In order for end users to quickly react to new issues that come up in production, it is proving useful to leverage the printk indexing system. This printk index enables kernel developers to use calls to printk() with changeable ad-hoc format strings (as they always have; no change of expectations), while enabling end users to examine format strings to detect changes. 
Since end users are using regular expressions to match messages printed through printk(), being able to detect changes in chosen format strings from release to release provides a useful signal to review printk()-matching regular expressions for any necessary updates. So that detailed FAT messages are captured by this printk index, this patch wraps fat_msg with a macro. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/8aaa2dd7995e820292bb40d2120ab69756662c65.1648688136.git.jof@thejof.com Signed-off-by: Jonathan Lassoff Acked-by: OGAWA Hirofumi Reviewed-by: Petr Mladek Tested-by: Petr Mladek Signed-off-by: Andrew Morton --- fs/fat/fat.h | 9 ++++++++- fs/fat/misc.c | 14 ++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 02d4d4234956..2cf85a6e0d99 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -433,8 +433,15 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...); __fat_fs_error(sb, 1, fmt , ## args) #define fat_fs_error_ratelimit(sb, fmt, args...) \ __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) + +#define FAT_PRINTK_PREFIX "%sFAT-fs (%s): " +#define fat_msg(sb, level, fmt, args...) \ +do { \ + printk_index_subsys_emit(FAT_PRINTK_PREFIX, level, fmt, ##args);\ + _fat_msg(sb, level, fmt, ##args); \ +} while (0) __printf(3, 4) __cold -void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); +void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); #define fat_msg_ratelimit(sb, level, fmt, args...) \ do { \ if (__ratelimit(&MSDOS_SB(sb)->ratelimit)) \ diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 91ca3c304211..855477d89f41 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -42,10 +42,16 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) EXPORT_SYMBOL_GPL(__fat_fs_error); /** - * fat_msg() - print preformated FAT specific messages. 
Every thing what is - * not fat_fs_error() should be fat_msg(). + * _fat_msg() - Print a preformatted FAT message based on a superblock. + * @sb: A pointer to a &struct super_block + * @level: A Kernel printk level constant + * @fmt: The printf-style format string to print. + * + * Everything that is not fat_fs_error() should be fat_msg(). + * + * fat_msg() wraps _fat_msg() for printk indexing. */ -void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) +void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -53,7 +59,7 @@ void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf); + _printk(FAT_PRINTK_PREFIX "%pV\n", level, sb->s_id, &vaf); va_end(args); } From 183c3237c928109d2008c0456dff508baf692b20 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 29 Apr 2022 14:38:02 -0700 Subject: [PATCH 30/65] fat: add ratelimit to fat*_ent_bread() fat*_ent_bread() can be the cause of too many reports on the I/O error path. So use fat_msg_ratelimit() instead.
Link: https://lkml.kernel.org/r/87bkxogfeq.fsf@mail.parknet.co.jp Signed-off-by: OGAWA Hirofumi Reported-by: qianfan Tested-by: qianfan Signed-off-by: Andrew Morton --- fs/fat/fatent.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 978ac6751aeb..1db348f8f887 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -94,7 +94,8 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, err_brelse: brelse(bhs[0]); err: - fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); + fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)", + (llu)blocknr); return -EIO; } @@ -107,8 +108,8 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, fatent->fat_inode = MSDOS_SB(sb)->fat_inode; fatent->bhs[0] = sb_bread(sb, blocknr); if (!fatent->bhs[0]) { - fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", - (llu)blocknr); + fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)", + (llu)blocknr); return -EIO; } fatent->nr_bhs = 1; From f26b2afd53e70db67be8252d340b4a1387ec8b55 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 29 Apr 2022 14:38:02 -0700 Subject: [PATCH 31/65] ptrace: remove redundant check of #ifdef PTRACE_SINGLESTEP Patch series "ptrace: do some cleanup". This patch (of 3): PTRACE_SINGLESTEP is always defined as 9 in include/uapi/linux/ptrace.h, remove redundant check of #ifdef PTRACE_SINGLESTEP.
Link: https://lkml.kernel.org/r/1649240981-11024-2-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/ptrace.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ccc4b465775b..49c29baf9907 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -829,11 +829,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, } #endif -#ifdef PTRACE_SINGLESTEP #define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) -#else -#define is_singlestep(request) 0 -#endif #ifdef PTRACE_SINGLEBLOCK #define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK) @@ -1221,9 +1217,7 @@ int ptrace_request(struct task_struct *child, long request, } #endif -#ifdef PTRACE_SINGLESTEP case PTRACE_SINGLESTEP: -#endif #ifdef PTRACE_SINGLEBLOCK case PTRACE_SINGLEBLOCK: #endif From a9866bef5171c859cfabc1155c594d28f194aa23 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 29 Apr 2022 14:38:02 -0700 Subject: [PATCH 32/65] ptrace: fix wrong comment of PT_DTRACE PT_DTRACE is only used on um now, fix the wrong comment. 
Link: https://lkml.kernel.org/r/1649240981-11024-3-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- include/linux/ptrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 15b3d176b6b4..db4509587d2c 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -30,7 +30,7 @@ extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 -#define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */ +#define PT_DTRACE 0x00000002 /* delayed trace (used on um) */ #define PT_OPT_FLAG_SHIFT 3 /* PT_TRACE_* event enable flags */ From f224cabeedb274db8e64824a50765e2eabacca90 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 29 Apr 2022 14:38:03 -0700 Subject: [PATCH 33/65] MAINTAINERS: remove redundant file of PTRACE SUPPORT entry In MAINTAINERS PTRACE SUPPORT entry, the file include/uapi/linux/ptrace.h is redundant, remove it. Link: https://lkml.kernel.org/r/1649240981-11024-4-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2647adf30569..886265f04061 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15963,7 +15963,6 @@ F: include/asm-generic/syscall.h F: include/linux/ptrace.h F: include/linux/regset.h F: include/uapi/linux/ptrace.h -F: include/uapi/linux/ptrace.h F: kernel/ptrace.c PULSE8-CEC DRIVER From 16b0b7adabfb5564a77fa35917afe08decd55b29 Mon Sep 17 00:00:00 2001 From: Michal Orzel Date: Fri, 29 Apr 2022 14:38:03 -0700 Subject: [PATCH 34/65] kexec: remove redundant assignments Get rid of redundant assignments which end up in values not being read either because they are overwritten or the function ends. 
Reported by clang-tidy [deadcode.DeadStores] Link: https://lkml.kernel.org/r/20220326180948.192154-1-michalorzel.eng@gmail.com Signed-off-by: Michal Orzel Acked-by: Baoquan He Cc: Eric Biederman Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Michal Orzel Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 68480f731192..d08904a27362 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -768,7 +768,6 @@ static struct page *kimage_alloc_page(struct kimage *image, kimage_free_pages(old_page); continue; } - addr = old_addr; page = old_page; break; } @@ -788,7 +787,6 @@ static int kimage_load_normal_segment(struct kimage *image, unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; - result = 0; if (image->file_mode) kbuf = segment->kbuf; else From f8323a0cb9a66d8d8747c463211392a2cfc4c1dc Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Fri, 29 Apr 2022 14:38:03 -0700 Subject: [PATCH 35/65] rapidio: remove unnecessary use of list iterator req->map is set in the valid case and always equals 'map' if the break was hit. It therefore is unnecessary to use the list iterator variable and the use of 'map' can be replaced with req->map. This is done in preparation to limit the scope of a list iterator to the list traversal loop [1]. Link: https://lore.kernel.org/all/YhdfEIwI4EdtHdym@kroah.com/ Link: https://lkml.kernel.org/r/20220319203344.2547702-1-jakobkoschel@gmail.com Signed-off-by: Jakob Koschel Reviewed-by: John Hubbard Cc: Matt Porter Cc: Alexandre Bounine Cc: Kees Cook Cc: Mike Rapoport Cc: "Brian Johannesmeyer" Cc: Cristiano Giuffrida Cc: "Bos, H.J." 
Signed-off-by: Andrew Morton --- drivers/rapidio/devices/rio_mport_cdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 7df466e22282..2cdc054e53a5 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -915,7 +915,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, goto err_req; } - if (xfer->length + xfer->offset > map->size) { + if (xfer->length + xfer->offset > req->map->size) { ret = -EINVAL; goto err_req; } @@ -927,7 +927,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, } sg_set_buf(req->sgt.sgl, - map->virt_addr + (baddr - map->phys_addr) + + req->map->virt_addr + (baddr - req->map->phys_addr) + xfer->offset, xfer->length); } From 0e0af57e0e91b304f36b7d1dba859e3c04094273 Mon Sep 17 00:00:00 2001 From: "Dr. Thomas Orgis" Date: Fri, 29 Apr 2022 14:38:03 -0700 Subject: [PATCH 36/65] taskstats: version 12 with thread group and exe info The task exit struct needs some crucial information to be able to provide an enhanced version of process and thread accounting. This change provides: 1. ac_tgid in addition to ac_pid 2. thread group execution walltime in ac_tgetime 3. flag AGROUP in ac_flag to indicate the last task in a thread group / process 4. device ID and inode of task's /proc/self/exe in ac_exe_dev and ac_exe_inode 5. tools/accounting/procacct as demonstrator When a task exits, taskstats are reported to userspace including the task's pid and ppid, but without the id of the thread group this task is part of. Without the tgid, the stats of single tasks cannot be correlated to each other as a thread group (process). The taskstats documentation suggests that on process exit a data set consisting of accumulated stats for the whole group is produced.
But such an additional set of stats is only produced for actually multithreaded processes, not groups that had only one thread, and also those stats only contain data about delay accounting and not the more basic information about CPU and memory resource usage. Adding the AGROUP flag to be set when the last task of a group exited enables determination of process end also for single-threaded processes. My application basically does enhanced process accounting with summed cputime, biggest maxrss, tasks per process. The data is not available with the traditional BSD process accounting (which is not designed to be extensible) and the taskstats interface allows more efficient on-the-fly grouping and summing of the stats, anyway, without intermediate disk writes. Furthermore, I do carry statistics on which exact program binary is used how often with associated resources, getting a picture on how important which parts of a collection of installed scientific software in different versions are, and how well they put load on the machine. This is enabled by providing information on /proc/self/exe for each task. I assume the two 64-bit fields for device ID and inode are more appropriate than the possibly large resolved path to keep the data volume down. Add the tgid to the stats to complete task identification, the flag AGROUP to mark the last task of a group, the group wallclock time, and inode-based identification of the associated executable file. Add tools/accounting/procacct.c as a simplified fork of getdelays.c to demonstrate process and thread accounting. [thomas.orgis@uni-hamburg.de: fix version number in comment] Link: https://lkml.kernel.org/r/20220405003601.7a5f6008@plasteblaster Link: https://lkml.kernel.org/r/20220331004106.64e5616b@plasteblaster Signed-off-by: Dr. Thomas Orgis Reviewed-by: Ismael Luceno Cc: Balbir Singh Cc: Eric W.
Biederman Cc: xu xin Cc: Yang Yang Signed-off-by: Andrew Morton --- include/uapi/linux/acct.h | 3 +- include/uapi/linux/taskstats.h | 24 +- kernel/taskstats.c | 23 ++ kernel/tsacct.c | 10 +- tools/accounting/.gitignore | 1 + tools/accounting/Makefile | 2 +- tools/accounting/procacct.c | 417 +++++++++++++++++++++++++++++++++ 7 files changed, 473 insertions(+), 7 deletions(-) create mode 100644 tools/accounting/procacct.c diff --git a/include/uapi/linux/acct.h b/include/uapi/linux/acct.h index 985b89068591..0e591152aa8a 100644 --- a/include/uapi/linux/acct.h +++ b/include/uapi/linux/acct.h @@ -103,12 +103,13 @@ struct acct_v3 /* * accounting flags */ - /* bit set when the process ... */ + /* bit set when the process/task ... */ #define AFORK 0x01 /* ... executed fork, but did not exec */ #define ASU 0x02 /* ... used super-user privileges */ #define ACOMPAT 0x04 /* ... used compatibility mode (VAX only not used) */ #define ACORE 0x08 /* ... dumped core */ #define AXSIG 0x10 /* ... was killed by a signal */ +#define AGROUP 0x20 /* ... was the last task of the process (task group) */ #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN) #define ACCT_BYTEORDER 0x80 /* accounting file is big endian */ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index 12327d32378f..736154171489 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 11 +#define TASKSTATS_VERSION 12 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -48,7 +48,8 @@ struct taskstats { __u32 ac_exitcode; /* Exit status */ /* The accounting flags of a task as defined in - * Defined values are AFORK, ASU, ACOMPAT, ACORE, and AXSIG. + * Defined values are AFORK, ASU, ACOMPAT, ACORE, AXSIG, and AGROUP. + * (AGROUP since version 12). 
*/ __u8 ac_flag; /* Record flags */ __u8 ac_nice; /* task_nice */ @@ -173,9 +174,26 @@ struct taskstats { /* v10: 64-bit btime to avoid overflow */ __u64 ac_btime64; /* 64-bit begin time */ - /* Delay waiting for memory compact */ + /* v11: Delay waiting for memory compact */ __u64 compact_count; __u64 compact_delay_total; + + /* v12 begin */ + __u32 ac_tgid; /* thread group ID */ + /* Thread group walltime up to now. This is total process walltime if + * AGROUP flag is set. + */ + __u64 ac_tgetime __attribute__((aligned(8))); + /* Lightweight information to identify process binary files. + * This leaves userspace to match this to a file system path, using + * MAJOR() and MINOR() macros to identify a device and mount point, + * the inode to identify the executable file. This is /proc/self/exe + * at the end, so matching the most recent exec(). Values are zero + * for kernel threads. + */ + __u64 ac_exe_dev; /* program binary device ID */ + __u64 ac_exe_inode; /* program binary inode number */ + /* v12 end */ }; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index bcac5a9043aa..72415e22342b 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -153,6 +154,23 @@ static void send_cpu_listeners(struct sk_buff *skb, up_write(&listeners->sem); } +static void exe_add_tsk(struct taskstats *stats, struct task_struct *tsk) +{ + /* No idea if I'm allowed to access that here, now. */ + struct file *exe_file = get_task_exe_file(tsk); + + if (exe_file) { + /* Following cp_new_stat64() in stat.c . 
*/ + stats->ac_exe_dev = + huge_encode_dev(exe_file->f_inode->i_sb->s_dev); + stats->ac_exe_inode = exe_file->f_inode->i_ino; + fput(exe_file); + } else { + stats->ac_exe_dev = 0; + stats->ac_exe_inode = 0; + } +} + static void fill_stats(struct user_namespace *user_ns, struct pid_namespace *pid_ns, struct task_struct *tsk, struct taskstats *stats) @@ -175,6 +193,9 @@ static void fill_stats(struct user_namespace *user_ns, /* fill in extended acct fields */ xacct_add_tsk(stats, tsk); + + /* add executable info */ + exe_add_tsk(stats, tsk); } static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) @@ -620,6 +641,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) goto err; fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); + if (group_dead) + stats->ac_flag |= AGROUP; /* * Doesn't matter if tsk is the leader or the last group member leaving diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 1d261fbe367b..4252f0645b9e 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -23,15 +23,20 @@ void bacct_add_tsk(struct user_namespace *user_ns, { const struct cred *tcred; u64 utime, stime, utimescaled, stimescaled; - u64 delta; + u64 now_ns, delta; time64_t btime; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); /* calculate task elapsed time in nsec */ - delta = ktime_get_ns() - tsk->start_time; + now_ns = ktime_get_ns(); + /* store whole group time first */ + delta = now_ns - tsk->group_leader->start_time; /* Convert to micro seconds */ do_div(delta, NSEC_PER_USEC); + stats->ac_tgetime = delta; + delta = now_ns - tsk->start_time; + do_div(delta, NSEC_PER_USEC); stats->ac_etime = delta; /* Convert to seconds for btime (note y2106 limit) */ btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC); @@ -51,6 +56,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); + stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = 
__task_cred(tsk); stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); diff --git a/tools/accounting/.gitignore b/tools/accounting/.gitignore index c45fb4ed4309..522a690aaf3d 100644 --- a/tools/accounting/.gitignore +++ b/tools/accounting/.gitignore @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only getdelays +procacct diff --git a/tools/accounting/Makefile b/tools/accounting/Makefile index 03687f19cbb1..11def1ad046c 100644 --- a/tools/accounting/Makefile +++ b/tools/accounting/Makefile @@ -2,7 +2,7 @@ CC := $(CROSS_COMPILE)gcc CFLAGS := -I../../usr/include -PROGS := getdelays +PROGS := getdelays procacct all: $(PROGS) diff --git a/tools/accounting/procacct.c b/tools/accounting/procacct.c new file mode 100644 index 000000000000..8353d3237e50 --- /dev/null +++ b/tools/accounting/procacct.c @@ -0,0 +1,417 @@ +// SPDX-License-Identifier: GPL-2.0 +/* procacct.c + * + * Demonstrator of fetching resource data on task exit, as a way + * to accumulate accurate program resource usage statistics, without + * prior identification of the programs. For that, the fields for + * device and inode of the program executable binary file are also + * extracted in addition to the command string. + * + * The TGID together with the PID and the AGROUP flag allow + * identification of threads in a process and single-threaded processes. + * The ac_tgetime field gives proper whole-process walltime. + * + * Written (changed) by Thomas Orgis, University of Hamburg in 2022 + * + * This is a cheap derivation (inheriting the style) of getdelays.c: + * + * Utility to get per-pid and per-tgid delay accounting statistics + * Also illustrates usage of the taskstats interface + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2005 + * Copyright (C) Balbir Singh, IBM Corp. 2006 + * Copyright (c) Jay Lan, SGI. 
2006 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Generic macros for dealing with netlink sockets. Might be duplicated + * elsewhere. It is recommended that commercial grade applications use + * libnl or libnetlink and use the interfaces provided by the library + */ +#define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) +#define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) +#define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN)) +#define NLA_PAYLOAD(len) (len - NLA_HDRLEN) + +#define err(code, fmt, arg...) \ + do { \ + fprintf(stderr, fmt, ##arg); \ + exit(code); \ + } while (0) + +int rcvbufsz; +char name[100]; +int dbg; +int print_delays; +int print_io_accounting; +int print_task_context_switch_counts; + +#define PRINTF(fmt, arg...) { \ + if (dbg) { \ + printf(fmt, ##arg); \ + } \ + } + +/* Maximum size of response requested or message sent */ +#define MAX_MSG_SIZE 1024 +/* Maximum number of cpus expected to be specified in a cpumask */ +#define MAX_CPUS 32 + +struct msgtemplate { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[MAX_MSG_SIZE]; +}; + +char cpumask[100+6*MAX_CPUS]; + +static void usage(void) +{ + fprintf(stderr, "procacct [-v] [-w logfile] [-r bufsize] [-m cpumask]\n"); + fprintf(stderr, " -v: debug on\n"); +} + +/* + * Create a raw netlink socket and bind + */ +static int create_nl_socket(int protocol) +{ + int fd; + struct sockaddr_nl local; + + fd = socket(AF_NETLINK, SOCK_RAW, protocol); + if (fd < 0) + return -1; + + if (rcvbufsz) + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, + &rcvbufsz, sizeof(rcvbufsz)) < 0) { + fprintf(stderr, "Unable to set socket rcv buf size to %d\n", + rcvbufsz); + goto error; + } + + memset(&local, 0, sizeof(local)); + local.nl_family = AF_NETLINK; + + if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) + goto error; + + return fd; +error: + 
close(fd); + return -1; +} + + +static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, + __u8 genl_cmd, __u16 nla_type, + void *nla_data, int nla_len) +{ + struct nlattr *na; + struct sockaddr_nl nladdr; + int r, buflen; + char *buf; + + struct msgtemplate msg; + + msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + msg.n.nlmsg_type = nlmsg_type; + msg.n.nlmsg_flags = NLM_F_REQUEST; + msg.n.nlmsg_seq = 0; + msg.n.nlmsg_pid = nlmsg_pid; + msg.g.cmd = genl_cmd; + msg.g.version = 0x1; + na = (struct nlattr *) GENLMSG_DATA(&msg); + na->nla_type = nla_type; + na->nla_len = nla_len + 1 + NLA_HDRLEN; + memcpy(NLA_DATA(na), nla_data, nla_len); + msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); + + buf = (char *) &msg; + buflen = msg.n.nlmsg_len; + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr, + sizeof(nladdr))) < buflen) { + if (r > 0) { + buf += r; + buflen -= r; + } else if (errno != EAGAIN) + return -1; + } + return 0; +} + + +/* + * Probe the controller in genetlink to find the family id + * for the TASKSTATS family + */ +static int get_family_id(int sd) +{ + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[256]; + } ans; + + int id = 0, rc; + struct nlattr *na; + int rep_len; + + strcpy(name, TASKSTATS_GENL_NAME); + rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY, + CTRL_ATTR_FAMILY_NAME, (void *)name, + strlen(TASKSTATS_GENL_NAME)+1); + if (rc < 0) + return 0; /* sendto() failure? */ + + rep_len = recv(sd, &ans, sizeof(ans), 0); + if (ans.n.nlmsg_type == NLMSG_ERROR || + (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) + return 0; + + na = (struct nlattr *) GENLMSG_DATA(&ans); + na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); + if (na->nla_type == CTRL_ATTR_FAMILY_ID) + id = *(__u16 *) NLA_DATA(na); + + return id; +} + +#define average_ms(t, c) (t / 1000000ULL / (c ? 
c : 1)) + +static void print_procacct(struct taskstats *t) +{ + /* First letter: T is a mere thread, G the last in a group, U unknown. */ + printf( + "%c pid=%lu tgid=%lu uid=%lu wall=%llu gwall=%llu cpu=%llu vmpeak=%llu rsspeak=%llu dev=%lu:%lu inode=%llu comm=%s\n" + , t->version >= 12 ? (t->ac_flag & AGROUP ? 'P' : 'T') : '?' + , (unsigned long)t->ac_pid + , (unsigned long)(t->version >= 12 ? t->ac_tgid : 0) + , (unsigned long)t->ac_uid + , (unsigned long long)t->ac_etime + , (unsigned long long)(t->version >= 12 ? t->ac_tgetime : 0) + , (unsigned long long)(t->ac_utime+t->ac_stime) + , (unsigned long long)t->hiwater_vm + , (unsigned long long)t->hiwater_rss + , (unsigned long)(t->version >= 12 ? MAJOR(t->ac_exe_dev) : 0) + , (unsigned long)(t->version >= 12 ? MINOR(t->ac_exe_dev) : 0) + , (unsigned long long)(t->version >= 12 ? t->ac_exe_inode : 0) + , t->ac_comm + ); +} + +void handle_aggr(int mother, struct nlattr *na, int fd) +{ + int aggr_len = NLA_PAYLOAD(na->nla_len); + int len2 = 0; + pid_t rtid = 0; + + na = (struct nlattr *) NLA_DATA(na); + while (len2 < aggr_len) { + switch (na->nla_type) { + case TASKSTATS_TYPE_PID: + rtid = *(int *) NLA_DATA(na); + PRINTF("PID\t%d\n", rtid); + break; + case TASKSTATS_TYPE_TGID: + rtid = *(int *) NLA_DATA(na); + PRINTF("TGID\t%d\n", rtid); + break; + case TASKSTATS_TYPE_STATS: + if (mother == TASKSTATS_TYPE_AGGR_PID) + print_procacct((struct taskstats *) NLA_DATA(na)); + if (fd) { + if (write(fd, NLA_DATA(na), na->nla_len) < 0) + err(1, "write error\n"); + } + break; + case TASKSTATS_TYPE_NULL: + break; + default: + fprintf(stderr, "Unknown nested nla_type %d\n", + na->nla_type); + break; + } + len2 += NLA_ALIGN(na->nla_len); + na = (struct nlattr *)((char *)na + + NLA_ALIGN(na->nla_len)); + } +} + +int main(int argc, char *argv[]) +{ + int c, rc, rep_len, aggr_len, len2; + int cmd_type = TASKSTATS_CMD_ATTR_UNSPEC; + __u16 id; + __u32 mypid; + + struct nlattr *na; + int nl_sd = -1; + int len = 0; + pid_t tid = 0; + + 
int fd = 0; + int write_file = 0; + int maskset = 0; + char *logfile = NULL; + int containerset = 0; + char *containerpath = NULL; + int cfd = 0; + int forking = 0; + sigset_t sigset; + + struct msgtemplate msg; + + while (!forking) { + c = getopt(argc, argv, "m:vr:"); + if (c < 0) + break; + + switch (c) { + case 'w': + logfile = strdup(optarg); + printf("write to file %s\n", logfile); + write_file = 1; + break; + case 'r': + rcvbufsz = atoi(optarg); + printf("receive buf size %d\n", rcvbufsz); + if (rcvbufsz < 0) + err(1, "Invalid rcv buf size\n"); + break; + case 'm': + strncpy(cpumask, optarg, sizeof(cpumask)); + cpumask[sizeof(cpumask) - 1] = '\0'; + maskset = 1; + break; + case 'v': + printf("debug on\n"); + dbg = 1; + break; + default: + usage(); + exit(-1); + } + } + if (!maskset) { + maskset = 1; + strncpy(cpumask, "1", sizeof(cpumask)); + cpumask[sizeof(cpumask) - 1] = '\0'; + } + printf("cpumask %s maskset %d\n", cpumask, maskset); + + if (write_file) { + fd = open(logfile, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd == -1) { + perror("Cannot open output file\n"); + exit(1); + } + } + + nl_sd = create_nl_socket(NETLINK_GENERIC); + if (nl_sd < 0) + err(1, "error creating Netlink socket\n"); + + mypid = getpid(); + id = get_family_id(nl_sd); + if (!id) { + fprintf(stderr, "Error getting family id, errno %d\n", errno); + goto err; + } + PRINTF("family id %d\n", id); + + if (maskset) { + rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET, + TASKSTATS_CMD_ATTR_REGISTER_CPUMASK, + &cpumask, strlen(cpumask) + 1); + PRINTF("Sent register cpumask, retval %d\n", rc); + if (rc < 0) { + fprintf(stderr, "error sending register cpumask\n"); + goto err; + } + } + + do { + rep_len = recv(nl_sd, &msg, sizeof(msg), 0); + PRINTF("received %d bytes\n", rep_len); + + if (rep_len < 0) { + fprintf(stderr, "nonfatal reply error: errno %d\n", + errno); + continue; + } + if (msg.n.nlmsg_type == NLMSG_ERROR || + !NLMSG_OK((&msg.n), rep_len)) { + struct nlmsgerr *err = 
NLMSG_DATA(&msg); + + fprintf(stderr, "fatal reply error, errno %d\n", + err->error); + goto done; + } + + PRINTF("nlmsghdr size=%zu, nlmsg_len=%d, rep_len=%d\n", + sizeof(struct nlmsghdr), msg.n.nlmsg_len, rep_len); + + + rep_len = GENLMSG_PAYLOAD(&msg.n); + + na = (struct nlattr *) GENLMSG_DATA(&msg); + len = 0; + while (len < rep_len) { + len += NLA_ALIGN(na->nla_len); + int mother = na->nla_type; + + PRINTF("mother=%i\n", mother); + switch (na->nla_type) { + case TASKSTATS_TYPE_AGGR_PID: + case TASKSTATS_TYPE_AGGR_TGID: + /* For nested attributes, na follows */ + handle_aggr(mother, na, fd); + break; + default: + fprintf(stderr, "Unexpected nla_type %d\n", + na->nla_type); + case TASKSTATS_TYPE_NULL: + break; + } + na = (struct nlattr *) (GENLMSG_DATA(&msg) + len); + } + } while (1); +done: + if (maskset) { + rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET, + TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK, + &cpumask, strlen(cpumask) + 1); + printf("Sent deregister mask, retval %d\n", rc); + if (rc < 0) + err(rc, "error sending deregister cpumask\n"); + } +err: + close(nl_sd); + if (fd) + close(fd); + if (cfd) + close(cfd); + return 0; +} From edc73c7261ca3ea79867437bb0b9dab0e232436c Mon Sep 17 00:00:00 2001 From: xu xin Date: Fri, 29 Apr 2022 14:38:03 -0700 Subject: [PATCH 37/65] kernel: make taskstats available from all net namespaces If getdelays runs in a non-init network namespace, it will fail in getting delayacct stats even if it has privilege of root user, which seems to be not very reasonable. We can simply reproduce this by executing commands: unshare -n getdelays -d -p I don't think net namespace should be an obstacle to the normal execution of getdelay function. So let's make it available from all net namespaces. Link: https://lkml.kernel.org/r/20220412071946.2532318-1-xu.xin16@zte.com.cn Signed-off-by: xu xin Cc: Balbir Singh Cc: Yang Yang Cc: "Dr. Thomas Orgis" Cc: Eric W. 
Biederman Cc: Ismael Luceno Signed-off-by: Andrew Morton --- kernel/taskstats.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 72415e22342b..f7e246336218 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -688,6 +688,7 @@ static struct genl_family family __ro_after_init = { .module = THIS_MODULE, .ops = taskstats_ops, .n_ops = ARRAY_SIZE(taskstats_ops), + .netnsok = true, }; /* Needed early in initialization */ From f6e2c20ca7604e6a267c93a511d19dda72573be1 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 29 Apr 2022 14:38:04 -0700 Subject: [PATCH 38/65] fs: sysv: check sbi->s_firstdatazone in complete_read_super sbi->s_firstinodezone is initialized to 2 and sbi->s_firstdatazone is read from sbd. There's no guarantee that sbi->s_firstdatazone is bigger than sbi->s_firstinodezone. If sbi->s_firstdatazone is less than 2, the filesystem can still be mounted unexpectedly. At this point, sbi->s_ninodes flips to a very large value and this filesystem is broken. We can observe this by executing the 'df' command.
When we execute, we will get an error message: "sysv_count_free_inodes: unable to read inode table" Link: https://lkml.kernel.org/r/20220330104215.530223-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- fs/sysv/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/sysv/super.c b/fs/sysv/super.c index d1def0771a40..3365a30dc1e0 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -312,7 +312,9 @@ static int complete_read_super(struct super_block *sb, int silent, int size) sbi->s_firstinodezone = 2; flavour_setup[sbi->s_type](sbi, &sb->s_max_links); - + if (sbi->s_firstdatazone < sbi->s_firstinodezone) + return 0; + sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone; sbi->s_inodes_per_block = bsize >> 6; sbi->s_inodes_per_block_1 = (bsize >> 6)-1; From 7055197705709c59b8ab77e6a5c7d46d61edd96e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 9 May 2022 18:29:19 -0700 Subject: [PATCH 39/65] proc: fix dentry/inode overinstantiating under /proc/${pid}/net When a process exits, /proc/${pid}, and /proc/${pid}/net dentries are flushed. However some leaf dentries like /proc/${pid}/net/arp_cache aren't. That's because respective PDEs have proc_misc_d_revalidate() hook which returns 1 and leaves dentries/inodes in the LRU. Force revalidation/lookup on everything under /proc/${pid}/net by inheriting proc_net_dentry_ops. 
[akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/YjdVHgildbWO7diJ@localhost.localdomain Fixes: c6c75deda813 ("proc: fix lookup in /proc/net subdirectories after setns(2)") Signed-off-by: Alexey Dobriyan Reported-by: hui li Cc: Al Viro Signed-off-by: Andrew Morton --- fs/proc/generic.c | 3 +++ fs/proc/proc_net.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index f2132407e133..587b91d9d998 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -448,6 +448,9 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, proc_set_user(ent, (*parent)->uid, (*parent)->gid); ent->proc_dops = &proc_misc_dentry_ops; + /* Revalidate everything under /proc/${pid}/net */ + if ((*parent)->proc_dops == &proc_net_dentry_ops) + pde_force_lookup(ent); out: return ent; diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index e1cfeda397f3..913e5acefbb6 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -376,6 +376,9 @@ static __net_init int proc_net_ns_init(struct net *net) proc_set_user(netd, uid, gid); + /* Seed dentry revalidation for /proc/${pid}/net */ + pde_force_lookup(netd); + err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); if (!net_statd) From da028e4c4b0279eb49f80220d8f7cc62b4a57ccb Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Mon, 9 May 2022 18:29:19 -0700 Subject: [PATCH 40/65] initramfs: refactor do_header() cpio magic checks Patch series "initramfs: "crc" cpio format and INITRAMFS_PRESERVE_MTIME", v7. This patchset does some minor initramfs refactoring and allows cpio entry mtime preservation to be disabled via a new Kconfig INITRAMFS_PRESERVE_MTIME option. Patches 4/6 to 6/6 implement support for creation and extraction of "crc" cpio archives, which carry file data checksums. 
Basic tests for this functionality can be found at https://github.com/rapido-linux/rapido/pull/163 This patch (of 6): do_header() is called for each cpio entry and fails if the first six bytes don't match "newc" magic. The magic check includes a special case error message if POSIX.1 ASCII (cpio -H odc) magic is detected. This special case POSIX.1 check can be nested under the "newc" mismatch code path to avoid calling memcmp() twice in a non-error case. Link: https://lkml.kernel.org/r/20220404093429.27570-1-ddiss@suse.de Link: https://lkml.kernel.org/r/20220404093429.27570-2-ddiss@suse.de Signed-off-by: David Disseldorp Reviewed-by: Martin Wilck Acked-by: Christian Brauner Cc: Al Viro Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- init/initramfs.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index 2f3d96dc3db6..2f79b3ec0b40 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -257,12 +257,11 @@ static int __init do_collect(void) static int __init do_header(void) { - if (memcmp(collected, "070707", 6)==0) { - error("incorrect cpio method used: use -H newc option"); - return 1; - } if (memcmp(collected, "070701", 6)) { - error("no cpio magic"); + if (memcmp(collected, "070707", 6) == 0) + error("incorrect cpio method used: use -H newc option"); + else + error("no cpio magic"); return 1; } parse_header(collected); From fcb7aedd2e90c4ad43f7f01827014df8c6f034a5 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Mon, 9 May 2022 18:29:19 -0700 Subject: [PATCH 41/65] initramfs: make dir_entry.name a flexible array member dir_entry.name is currently allocated via a separate kstrdup(). Change it to a flexible array member and allocate it along with struct dir_entry. 
Link: https://lkml.kernel.org/r/20220404093429.27570-3-ddiss@suse.de Signed-off-by: David Disseldorp Acked-by: Christian Brauner Cc: Al Viro Cc: Martin Wilck Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- init/initramfs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index 2f79b3ec0b40..656d2d71349f 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -130,17 +130,20 @@ static long __init do_utime(char *filename, time64_t mtime) static __initdata LIST_HEAD(dir_list); struct dir_entry { struct list_head list; - char *name; time64_t mtime; + char name[]; }; static void __init dir_add(const char *name, time64_t mtime) { - struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); + size_t nlen = strlen(name) + 1; + struct dir_entry *de; + + de = kmalloc(sizeof(struct dir_entry) + nlen, GFP_KERNEL); if (!de) panic_show_mem("can't allocate dir_entry buffer"); INIT_LIST_HEAD(&de->list); - de->name = kstrdup(name, GFP_KERNEL); + strscpy(de->name, name, nlen); de->mtime = mtime; list_add(&de->list, &dir_list); } @@ -151,7 +154,6 @@ static void __init dir_utime(void) list_for_each_entry_safe(de, tmp, &dir_list, list) { list_del(&de->list); do_utime(de->name, de->mtime); - kfree(de->name); kfree(de); } } From 1274aea127b2e8c9a4b9cbcc3ea6baf78990a958 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Mon, 9 May 2022 18:29:19 -0700 Subject: [PATCH 42/65] initramfs: add INITRAMFS_PRESERVE_MTIME Kconfig option initramfs cpio mtime preservation, as implemented in commit 889d51a10712 ("initramfs: add option to preserve mtime from initramfs cpio images"), uses a linked list to defer directory mtime processing until after all other items in the cpio archive have been processed. This is done to ensure that parent directory mtimes aren't overwritten via subsequent child creation. 
The lkml link below indicates that the mtime retention use case was for embedded devices with applications running exclusively out of initramfs, where the 32-bit mtime value provided a rough file version identifier. Linux distributions which discard an extracted initramfs immediately after the root filesystem has been mounted may want to avoid the unnecessary overhead. This change adds a new INITRAMFS_PRESERVE_MTIME Kconfig option, which can be used to disable on-by-default mtime retention and in turn speed up initramfs extraction, particularly for cpio archives with large directory counts. Benchmarks with a one million directory cpio archive extracted 20 times demonstrated: mean extraction time (s) std dev INITRAMFS_PRESERVE_MTIME=y 3.808 0.006 INITRAMFS_PRESERVE_MTIME unset 3.056 0.004 The above extraction times were measured using ftrace (initcall_finish - initcall_start) values for populate_rootfs() with initramfs_async disabled. [ddiss@suse.de: rebase atop dir_entry.name flexible array member and drop separate initramfs_mtime.h header] Link: https://lkml.org/lkml/2008/9/3/424 Link: https://lkml.kernel.org/r/20220404093429.27570-4-ddiss@suse.de Signed-off-by: David Disseldorp Reviewed-by: Martin Wilck Cc: Al Viro Cc: Christian Brauner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- init/Kconfig | 10 ++++++++++ init/initramfs.c | 28 ++++++++++++++++------------ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 5cddb9ba0eef..90cb1ac936db 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1361,6 +1361,16 @@ config BOOT_CONFIG If unsure, say Y. +config INITRAMFS_PRESERVE_MTIME + bool "Preserve cpio archive mtimes in initramfs" + default y + help + Each entry in an initramfs cpio archive carries an mtime value. When + enabled, extracted cpio items take this mtime, with directory mtime + setting deferred until after creation of any child entries. + + If unsure, say Y. 
+ choice prompt "Compiler optimization level" default CC_OPTIMIZE_FOR_PERFORMANCE diff --git a/init/initramfs.c b/init/initramfs.c index 656d2d71349f..b5bfed859fa9 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -116,15 +116,17 @@ static void __init free_hash(void) } } -static long __init do_utime(char *filename, time64_t mtime) +#ifdef CONFIG_INITRAMFS_PRESERVE_MTIME +static void __init do_utime(char *filename, time64_t mtime) { - struct timespec64 t[2]; + struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } }; + init_utimes(filename, t); +} - t[0].tv_sec = mtime; - t[0].tv_nsec = 0; - t[1].tv_sec = mtime; - t[1].tv_nsec = 0; - return init_utimes(filename, t); +static void __init do_utime_path(const struct path *path, time64_t mtime) +{ + struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } }; + vfs_utimes(path, t); } static __initdata LIST_HEAD(dir_list); @@ -157,6 +159,12 @@ static void __init dir_utime(void) kfree(de); } } +#else +static void __init do_utime(char *filename, time64_t mtime) {} +static void __init do_utime_path(const struct path *path, time64_t mtime) {} +static void __init dir_add(const char *name, time64_t mtime) {} +static void __init dir_utime(void) {} +#endif static __initdata time64_t mtime; @@ -381,14 +389,10 @@ static int __init do_name(void) static int __init do_copy(void) { if (byte_count >= body_len) { - struct timespec64 t[2] = { }; if (xwrite(wfile, victim, body_len, &wfile_pos) != body_len) error("write error"); - t[0].tv_sec = mtime; - t[1].tv_sec = mtime; - vfs_utimes(&wfile->f_path, t); - + do_utime_path(&wfile->f_path, mtime); fput(wfile); eat(body_len); state = SkipIt; From 3a2699cfbe317f6e1b9c84d2f10ab7debb1c79dc Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Mon, 9 May 2022 18:29:20 -0700 Subject: [PATCH 43/65] gen_init_cpio: fix short read file handling When processing a "file" entry, gen_init_cpio attempts to allocate a buffer large enough to stage the entire contents of the 
source file. It then attempts to fill the buffer via a single read() call and subsequently writes out the entire buffer length, without checking that read() returned the full length, potentially writing uninitialized buffer memory. Fix this by breaking up file I/O into 64k chunks and only writing the length returned by the prior read() call. Link: https://lkml.kernel.org/r/20220404093429.27570-5-ddiss@suse.de Signed-off-by: David Disseldorp Reviewed-by: Martin Wilck Cc: Al Viro Cc: Christian Brauner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- usr/gen_init_cpio.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/usr/gen_init_cpio.c b/usr/gen_init_cpio.c index 0e2c8a5838b1..9a0f8c37273a 100644 --- a/usr/gen_init_cpio.c +++ b/usr/gen_init_cpio.c @@ -20,6 +20,7 @@ #define xstr(s) #s #define str(s) xstr(s) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) static unsigned int offset; static unsigned int ino = 721; @@ -297,9 +298,8 @@ static int cpio_mkfile(const char *name, const char *location, unsigned int nlinks) { char s[256]; - char *filebuf = NULL; struct stat buf; - long size; + unsigned long size; int file = -1; int retval; int rc = -1; @@ -326,22 +326,17 @@ static int cpio_mkfile(const char *name, const char *location, buf.st_mtime = 0xffffffff; } - filebuf = malloc(buf.st_size); - if (!filebuf) { - fprintf (stderr, "out of memory\n"); - goto error; - } - - retval = read (file, filebuf, buf.st_size); - if (retval < 0) { - fprintf (stderr, "Can not read %s file\n", location); + if (buf.st_size > 0xffffffff) { + fprintf(stderr, "%s: Size exceeds maximum cpio file size\n", + location); goto error; } size = 0; for (i = 1; i <= nlinks; i++) { /* data goes on last link */ - if (i == nlinks) size = buf.st_size; + if (i == nlinks) + size = buf.st_size; if (name[0] == '/') name++; @@ -366,23 +361,34 @@ static int cpio_mkfile(const char *name, const char *location, push_string(name); push_pad(); - if 
(size) { - if (fwrite(filebuf, size, 1, stdout) != 1) { + while (size) { + unsigned char filebuf[65536]; + ssize_t this_read; + size_t this_size = MIN(size, sizeof(filebuf)); + + this_read = read(file, filebuf, this_size); + if (this_read <= 0 || this_read > this_size) { + fprintf(stderr, "Can not read %s file\n", location); + goto error; + } + + if (fwrite(filebuf, this_read, 1, stdout) != 1) { fprintf(stderr, "writing filebuf failed\n"); goto error; } - offset += size; - push_pad(); + offset += this_read; + size -= this_read; } + push_pad(); name += namesize; } ino++; rc = 0; - + error: - if (filebuf) free(filebuf); - if (file >= 0) close(file); + if (file >= 0) + close(file); return rc; } From ea8048719a0c46d95e6ab925bf0924e7198d9971 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Mon, 9 May 2022 18:29:20 -0700 Subject: [PATCH 44/65] gen_init_cpio: support file checksum archiving Documentation/driver-api/early-userspace/buffer-format.rst includes the specification for checksum-enabled cpio archives. Implement support for this format in gen_init_cpio via a new '-c' parameter. 
Link: https://lkml.kernel.org/r/20220404093429.27570-6-ddiss@suse.de Signed-off-by: David Disseldorp Suggested-by: Matthew Wilcox (Oracle) Cc: Al Viro Cc: Christian Brauner Cc: Martin Wilck Signed-off-by: Andrew Morton --- usr/gen_init_cpio.c | 54 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/usr/gen_init_cpio.c b/usr/gen_init_cpio.c index 9a0f8c37273a..dc838e26a5b9 100644 --- a/usr/gen_init_cpio.c +++ b/usr/gen_init_cpio.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include +#include #include #include #include @@ -25,6 +27,7 @@ static unsigned int offset; static unsigned int ino = 721; static time_t default_mtime; +static bool do_csum = false; struct file_handler { const char *type; @@ -78,7 +81,7 @@ static void cpio_trailer(void) sprintf(s, "%s%08X%08X%08lX%08lX%08X%08lX" "%08X%08X%08X%08X%08X%08X%08X", - "070701", /* magic */ + do_csum ? "070702" : "070701", /* magic */ 0, /* ino */ 0, /* mode */ (long) 0, /* uid */ @@ -110,7 +113,7 @@ static int cpio_mkslink(const char *name, const char *target, name++; sprintf(s,"%s%08X%08X%08lX%08lX%08X%08lX" "%08X%08X%08X%08X%08X%08X%08X", - "070701", /* magic */ + do_csum ? "070702" : "070701", /* magic */ ino++, /* ino */ S_IFLNK | mode, /* mode */ (long) uid, /* uid */ @@ -159,7 +162,7 @@ static int cpio_mkgeneric(const char *name, unsigned int mode, name++; sprintf(s,"%s%08X%08X%08lX%08lX%08X%08lX" "%08X%08X%08X%08X%08X%08X%08X", - "070701", /* magic */ + do_csum ? "070702" : "070701", /* magic */ ino++, /* ino */ mode, /* mode */ (long) uid, /* uid */ @@ -253,7 +256,7 @@ static int cpio_mknod(const char *name, unsigned int mode, name++; sprintf(s,"%s%08X%08X%08lX%08lX%08X%08lX" "%08X%08X%08X%08X%08X%08X%08X", - "070701", /* magic */ + do_csum ? 
"070702" : "070701", /* magic */ ino++, /* ino */ mode, /* mode */ (long) uid, /* uid */ @@ -293,6 +296,29 @@ static int cpio_mknod_line(const char *line) return rc; } +static int cpio_mkfile_csum(int fd, unsigned long size, uint32_t *csum) +{ + while (size) { + unsigned char filebuf[65536]; + ssize_t this_read; + size_t i, this_size = MIN(size, sizeof(filebuf)); + + this_read = read(fd, filebuf, this_size); + if (this_read <= 0 || this_read > this_size) + return -1; + + for (i = 0; i < this_read; i++) + *csum += filebuf[i]; + + size -= this_read; + } + /* seek back to the start for data segment I/O */ + if (lseek(fd, 0, SEEK_SET) < 0) + return -1; + + return 0; +} + static int cpio_mkfile(const char *name, const char *location, unsigned int mode, uid_t uid, gid_t gid, unsigned int nlinks) @@ -305,6 +331,7 @@ static int cpio_mkfile(const char *name, const char *location, int rc = -1; int namesize; unsigned int i; + uint32_t csum = 0; mode |= S_IFREG; @@ -332,6 +359,11 @@ static int cpio_mkfile(const char *name, const char *location, goto error; } + if (do_csum && cpio_mkfile_csum(file, buf.st_size, &csum) < 0) { + fprintf(stderr, "Failed to checksum file %s\n", location); + goto error; + } + size = 0; for (i = 1; i <= nlinks; i++) { /* data goes on last link */ @@ -343,7 +375,7 @@ static int cpio_mkfile(const char *name, const char *location, namesize = strlen(name) + 1; sprintf(s,"%s%08X%08X%08lX%08lX%08X%08lX" "%08lX%08X%08X%08X%08X%08X%08X", - "070701", /* magic */ + do_csum ? "070702" : "070701", /* magic */ ino, /* ino */ mode, /* mode */ (long) uid, /* uid */ @@ -356,7 +388,7 @@ static int cpio_mkfile(const char *name, const char *location, 0, /* rmajor */ 0, /* rminor */ namesize, /* namesize */ - 0); /* chksum */ + size ? 
csum : 0); /* chksum */ push_hdr(s); push_string(name); push_pad(); @@ -464,7 +496,7 @@ static int cpio_mkfile_line(const char *line) static void usage(const char *prog) { fprintf(stderr, "Usage:\n" - "\t%s [-t ] \n" + "\t%s [-t ] [-c] \n" "\n" " is a file containing newline separated entries that\n" "describe the files to be included in the initramfs archive:\n" @@ -499,7 +531,8 @@ static void usage(const char *prog) "\n" " is time in seconds since Epoch that will be used\n" "as mtime for symlinks, special files and directories. The default\n" - "is to use the current time for these entries.\n", + "is to use the current time for these entries.\n" + "-c: calculate and store 32-bit checksums for file data.\n", prog); } @@ -541,7 +574,7 @@ int main (int argc, char *argv[]) default_mtime = time(NULL); while (1) { - int opt = getopt(argc, argv, "t:h"); + int opt = getopt(argc, argv, "t:ch"); char *invalid; if (opt == -1) @@ -556,6 +589,9 @@ int main (int argc, char *argv[]) exit(1); } break; + case 'c': + do_csum = true; + break; case 'h': case '?': usage(argv[0]); From 800c24dc34b93d2014f3952683f8d5e9309e1b73 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Mon, 9 May 2022 18:29:20 -0700 Subject: [PATCH 45/65] initramfs: support cpio extraction with file checksums Add support for extraction of checksum-enabled "070702" cpio archives, specified in Documentation/driver-api/early-userspace/buffer-format.rst. Fail extraction if the calculated file data checksum doesn't match the value carried in the header. 
Link: https://lkml.kernel.org/r/20220404093429.27570-7-ddiss@suse.de Signed-off-by: David Disseldorp Suggested-by: Matthew Wilcox (Oracle) Cc: Al Viro Cc: Christian Brauner Cc: Martin Wilck Signed-off-by: Andrew Morton --- init/initramfs.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index b5bfed859fa9..dc84cf756cea 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -17,8 +17,11 @@ #include #include -static ssize_t __init xwrite(struct file *file, const char *p, size_t count, - loff_t *pos) +static __initdata bool csum_present; +static __initdata u32 io_csum; + +static ssize_t __init xwrite(struct file *file, const unsigned char *p, + size_t count, loff_t *pos) { ssize_t out = 0; @@ -33,6 +36,13 @@ static ssize_t __init xwrite(struct file *file, const char *p, size_t count, } else if (rv == 0) break; + if (csum_present) { + ssize_t i; + + for (i = 0; i < rv; i++) + io_csum += p[i]; + } + p += rv; out += rv; count -= rv; @@ -176,15 +186,16 @@ static __initdata unsigned long body_len, name_len; static __initdata uid_t uid; static __initdata gid_t gid; static __initdata unsigned rdev; +static __initdata u32 hdr_csum; static void __init parse_header(char *s) { - unsigned long parsed[12]; + unsigned long parsed[13]; char buf[9]; int i; buf[8] = '\0'; - for (i = 0, s += 6; i < 12; i++, s += 8) { + for (i = 0, s += 6; i < 13; i++, s += 8) { memcpy(buf, s, 8); parsed[i] = simple_strtoul(buf, NULL, 16); } @@ -199,6 +210,7 @@ static void __init parse_header(char *s) minor = parsed[8]; rdev = new_encode_dev(MKDEV(parsed[9], parsed[10])); name_len = parsed[11]; + hdr_csum = parsed[12]; } /* FSM */ @@ -267,7 +279,11 @@ static int __init do_collect(void) static int __init do_header(void) { - if (memcmp(collected, "070701", 6)) { + if (!memcmp(collected, "070701", 6)) { + csum_present = false; + } else if (!memcmp(collected, "070702", 6)) { + csum_present = true; + } else { if 
(memcmp(collected, "070707", 6) == 0) error("incorrect cpio method used: use -H newc option"); else @@ -362,6 +378,7 @@ static int __init do_name(void) if (IS_ERR(wfile)) return 0; wfile_pos = 0; + io_csum = 0; vfs_fchown(wfile, uid, gid); vfs_fchmod(wfile, mode); @@ -394,6 +411,8 @@ static int __init do_copy(void) do_utime_path(&wfile->f_path, mtime); fput(wfile); + if (csum_present && io_csum != hdr_csum) + error("bad data checksum"); eat(body_len); state = SkipIt; return 0; From 0e900029655327bb5326ced02eff97667a079039 Mon Sep 17 00:00:00 2001 From: Michal Orzel Date: Mon, 9 May 2022 18:29:20 -0700 Subject: [PATCH 46/65] ipc/sem: remove redundant assignments Get rid of redundant assignments which end up in values not being read either because they are overwritten or the function ends. Reported by clang-tidy [deadcode.DeadStores] Link: https://lkml.kernel.org/r/20220409101933.207157-1-michalorzel.eng@gmail.com Signed-off-by: Michal Orzel Reviewed-by: Tom Rix Reviewed-by: Nathan Chancellor Cc: Nick Desaulniers Signed-off-by: Andrew Morton --- ipc/sem.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/ipc/sem.c b/ipc/sem.c index 0dbdb98fdf2d..38ef91a63edd 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -766,7 +766,6 @@ static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) for (sop = sops; sop < sops + nsops; sop++) { curr = &sma->sems[sop->sem_num]; sem_op = sop->sem_op; - result = curr->semval; if (sop->sem_flg & SEM_UNDO) { int undo = un->semadj[sop->sem_num] - sem_op; @@ -1430,7 +1429,6 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, if (err) goto out_rcu_wakeup; - err = -EACCES; switch (cmd) { case GETALL: { From 49c9dd0df65d547a58642d2f717eeb560e1db140 Mon Sep 17 00:00:00 2001 From: Prakash Sangappa Date: Mon, 9 May 2022 18:29:20 -0700 Subject: [PATCH 47/65] ipc: update semtimedop() to use hrtimer semtimedop() should be converted to use hrtimer like it has been done for most of the system calls with timeouts. 
This system call already takes a struct timespec as an argument and can therefore provide finer granularity timed wait. Link: https://lkml.kernel.org/r/1651187881-2858-1-git-send-email-prakash.sangappa@oracle.com Signed-off-by: Prakash Sangappa Reviewed-by: Thomas Gleixner Reviewed-by: Davidlohr Bueso Reviewed-by: Manfred Spraul Signed-off-by: Andrew Morton --- ipc/sem.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/ipc/sem.c b/ipc/sem.c index 38ef91a63edd..c8496f98b139 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1993,7 +1993,9 @@ long __do_semtimedop(int semid, struct sembuf *sops, int max, locknum; bool undos = false, alter = false, dupsop = false; struct sem_queue queue; - unsigned long dup = 0, jiffies_left = 0; + unsigned long dup = 0; + ktime_t expires, *exp = NULL; + bool timed_out = false; if (nsops < 1 || semid < 0) return -EINVAL; @@ -2001,12 +2003,11 @@ long __do_semtimedop(int semid, struct sembuf *sops, return -E2BIG; if (timeout) { - if (timeout->tv_sec < 0 || timeout->tv_nsec < 0 || - timeout->tv_nsec >= 1000000000L) { - error = -EINVAL; - goto out; - } - jiffies_left = timespec64_to_jiffies(timeout); + if (!timespec64_valid(timeout)) + return -EINVAL; + expires = ktime_add_safe(ktime_get(), + timespec64_to_ktime(*timeout)); + exp = &expires; } @@ -2164,10 +2165,8 @@ long __do_semtimedop(int semid, struct sembuf *sops, sem_unlock(sma, locknum); rcu_read_unlock(); - if (timeout) - jiffies_left = schedule_timeout(jiffies_left); - else - schedule(); + timed_out = !schedule_hrtimeout_range(exp, + current->timer_slack_ns, HRTIMER_MODE_ABS); /* * fastpath: the semop has completed, either successfully or @@ -2208,7 +2207,7 @@ long __do_semtimedop(int semid, struct sembuf *sops, /* * If an interrupt occurred we have to clean up the queue. 
*/ - if (timeout && jiffies_left == 0) + if (timed_out) error = -EAGAIN; } while (error == -EINTR && !signal_pending(current)); /* spurious */ From d60c4d01a98bc1942dba6e3adc02031f5519f94b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 May 2022 18:29:21 -0700 Subject: [PATCH 48/65] ipc/mqueue: use get_tree_nodev() in mqueue_get_tree() When running the stress-ng clone benchmark with multiple testing threads, it was found that there was significant spinlock contention in sget_fc(). The contended spinlock was the sb_lock. It is under heavy contention because of the following code in the critical section of sget_fc(): hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { if (test(old, fc)) goto share_extant_sb; } After testing with added instrumentation code, it was found that the benchmark could generate thousands of ipc namespaces with the corresponding number of entries in the mqueue's fs_supers list where the namespaces are the key for the search. This leads to excessive time in scanning the list for a match. Looking back at the mqueue calling sequence leading to sget_fc(): mq_init_ns() => mq_create_mount() => fc_mount() => vfs_get_tree() => mqueue_get_tree() => get_tree_keyed() => vfs_get_super() => sget_fc() Currently, mq_init_ns() is the only mqueue function that will indirectly call mqueue_get_tree() with a newly allocated ipc namespace as the key for searching. As a result, there will never be a match with the existing ipc namespaces stored in the mqueue's fs_supers list. So using get_tree_keyed() to do an existing ipc namespace search is just a waste of time. Instead, we could use get_tree_nodev() to eliminate the useless search. By doing so, we can greatly reduce the sb_lock hold time and avoid the spinlock contention problem in case a large number of ipc namespaces are present.
Of course, if the code is modified in the future to allow mqueue_get_tree() to be called with an existing ipc namespace instead of a new one, we will have to use get_tree_keyed() in this case. The following stress-ng clone benchmark command was run on a 2-socket 48-core Intel system: ./stress-ng --clone 32 --verbose --oomable --metrics-brief -t 20 The "bogo ops/s" increased from 5948.45 before patch to 9137.06 after patch. This is an increase of 54% in performance. Link: https://lkml.kernel.org/r/20220121172315.19652-1-longman@redhat.com Fixes: 935c6912b198 ("ipc: Convert mqueue fs to fs_context") Signed-off-by: Waiman Long Cc: Al Viro Cc: David Howells Cc: Manfred Spraul Cc: Davidlohr Bueso Signed-off-by: Andrew Morton --- ipc/mqueue.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 7c08eb3c258d..54cb6264f8cf 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -45,6 +45,7 @@ struct mqueue_fs_context { struct ipc_namespace *ipc_ns; + bool newns; /* Set if newly created ipc namespace */ }; #define MQUEUE_MAGIC 0x19800202 @@ -427,6 +428,14 @@ static int mqueue_get_tree(struct fs_context *fc) { struct mqueue_fs_context *ctx = fc->fs_private; + /* + * With a newly created ipc namespace, we don't need to do a search + * for an ipc namespace match, but we still need to set s_fs_info. + */ + if (ctx->newns) { + fc->s_fs_info = ctx->ipc_ns; + return get_tree_nodev(fc, mqueue_fill_super); + } return get_tree_keyed(fc, mqueue_fill_super, ctx->ipc_ns); } @@ -454,6 +463,10 @@ static int mqueue_init_fs_context(struct fs_context *fc) return 0; } +/* + * mq_init_ns() is currently the only caller of mq_create_mount(). + * So the ns parameter is always a newly created ipc namespace. 
+ */ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) { struct mqueue_fs_context *ctx; @@ -465,6 +478,7 @@ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) return ERR_CAST(fc); ctx = fc->fs_private; + ctx->newns = true; put_ipc_ns(ctx->ipc_ns); ctx->ipc_ns = get_ipc_ns(ns); put_user_ns(fc->user_ns); From c9b516f16be5896a3d798f8efb03acbd2ceec715 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 12 May 2022 20:38:36 -0700 Subject: [PATCH 49/65] ELF, uapi: fixup ELF_ST_TYPE definition This is a very theoretical compile failure: ELF_ST_TYPE(st_info = A) The cast will bind first and st_info will stop being an lvalue: error: lvalue required as left operand of assignment Given that the only use of this macro is ELF_ST_TYPE(sym->st_info) where st_info is "unsigned char" I've decided to remove the cast, especially given that the companion macro ELF_ST_BIND doesn't use a cast. Link: https://lkml.kernel.org/r/Ymv7G1BeX4kt3obz@localhost.localdomain Signed-off-by: Alexey Dobriyan Acked-by: Kees Cook Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- include/uapi/linux/elf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 787c657bfae8..237f21a5e0f6 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -134,7 +134,7 @@ typedef __s64 Elf64_Sxword; #define STT_TLS 6 #define ELF_ST_BIND(x) ((x) >> 4) -#define ELF_ST_TYPE(x) (((unsigned int) x) & 0xf) +#define ELF_ST_TYPE(x) ((x) & 0xf) #define ELF32_ST_BIND(x) ELF_ST_BIND(x) #define ELF32_ST_TYPE(x) ELF_ST_TYPE(x) #define ELF64_ST_BIND(x) ELF_ST_BIND(x) From a7bd57b87f65e0e1c5d41baf51a0d0b49fb30808 Mon Sep 17 00:00:00 2001 From: lizhe Date: Thu, 12 May 2022 20:38:36 -0700 Subject: [PATCH 50/65] kernel/crash_core.c: remove redundant check of ck_cmdline At the end of get_last_crashkernel(), the NULL check of ck_cmdline is obviously unnecessary and redundant; let's clean it up.
Link: https://lkml.kernel.org/r/20220506104116.259323-1-sensor1010@163.com Signed-off-by: lizhe Acked-by: Baoquan He Acked-by: Philipp Rudo Cc: Vivek Goyal Cc: Dave Young Signed-off-by: Andrew Morton --- kernel/crash_core.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 256cf6db573c..c232f01a2c54 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -222,9 +222,6 @@ next: p = strstr(p+1, name); } - if (!ck_cmdline) - return NULL; - return ck_cmdline; } From cd290a9839cee2f6641558877e707bd373c8f6f1 Mon Sep 17 00:00:00 2001 From: Puyou Lu Date: Thu, 12 May 2022 20:38:36 -0700 Subject: [PATCH 51/65] lib/string_helpers: fix not adding strarray to device's resource list Add allocated strarray to device's resource list. This is a must to automatically release strarray when the device disappears. Without this fix we have a memory leak in the few drivers which use devm_kasprintf_strarray(). Link: https://lkml.kernel.org/r/20220506044409.30066-1-puyou.lu@gmail.com Link: https://lkml.kernel.org/r/20220506073623.2679-1-puyou.lu@gmail.com Fixes: acdb89b6c87a ("lib/string_helpers: Introduce managed variant of kasprintf_strarray()") Signed-off-by: Puyou Lu Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- lib/string_helpers.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 4f877e9551d5..5ed3beb066e6 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -757,6 +757,9 @@ char **devm_kasprintf_strarray(struct device *dev, const char *prefix, size_t n) return ERR_PTR(-ENOMEM); } + ptr->n = n; + devres_add(dev, ptr); + return ptr->array; } EXPORT_SYMBOL_GPL(devm_kasprintf_strarray); From a3b774342fa752a5290c0de36375289dfcf4a260 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 12 May 2022 20:38:37 -0700 Subject: [PATCH 52/65] fs/ntfs3: validate BOOT sectors_per_clusters When the NTFS BOOT 
sectors_per_clusters field is > 0x80, it represents a shift value. Make sure that the shift value is not too large before using it (NTFS max cluster size is 2MB). Return -EINVAL if it is too large. This prevents negative shift values and shift values that are larger than the field size. Prevents this UBSAN error: UBSAN: shift-out-of-bounds in ../fs/ntfs3/super.c:673:16 shift exponent -192 is negative Link: https://lkml.kernel.org/r/20220502175342.20296-1-rdunlap@infradead.org Fixes: 82cae269cfa9 ("fs/ntfs3: Add initialization of super block") Signed-off-by: Randy Dunlap Reported-by: syzbot+1631f09646bc214d2e76@syzkaller.appspotmail.com Reviewed-by: Namjae Jeon Cc: Konstantin Komarov Cc: Alexander Viro Cc: Kari Argillander Cc: Namjae Jeon Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- fs/ntfs3/super.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 278dcf502410..b2b54c4553f9 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -668,9 +668,11 @@ static u32 format_size_gb(const u64 bytes, u32 *mb) static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot) { - return boot->sectors_per_clusters <= 0x80 - ? boot->sectors_per_clusters - : (1u << (0 - boot->sectors_per_clusters)); + if (boot->sectors_per_clusters <= 0x80) + return boot->sectors_per_clusters; + if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */ + return 1U << (0 - boot->sectors_per_clusters); + return -EINVAL; } /* @@ -713,6 +715,8 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, /* cluster size: 512, 1K, 2K, 4K, ...
2M */ sct_per_clst = true_sectors_per_clst(boot); + if ((int)sct_per_clst < 0) + goto out; if (!is_power_of_2(sct_per_clst)) goto out; From 47b7eae62aa7dc69f0e6d12493e5468ba57bf074 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 12 May 2022 20:38:37 -0700 Subject: [PATCH 53/65] relay: remove redundant assignment to pointer buf Pointer buf is being assigned a value that is not being read; buf is being re-assigned in the next statement. The assignment is redundant and can be removed. Cleans up clang scan build warning: kernel/relay.c:443:8: warning: Although the value stored to 'buf' is used in the enclosing expression, the value is never actually read from 'buf' [deadcode.DeadStores] Link: https://lkml.kernel.org/r/20220508212152.58753-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Reviewed-by: Jens Axboe Cc: Christoph Hellwig Cc: Kalle Valo Signed-off-by: Andrew Morton --- kernel/relay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/relay.c b/kernel/relay.c index d1a67fbb819d..6a611e779e95 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -440,7 +440,7 @@ int relay_prepare_cpu(unsigned int cpu) mutex_lock(&relay_channels_mutex); list_for_each_entry(chan, &relay_channels, list) { - if ((buf = *per_cpu_ptr(chan->buf, cpu))) + if (*per_cpu_ptr(chan->buf, cpu)) continue; buf = relay_open_buf(chan, cpu); if (!buf) { From 6b9dbedbe3499fef862c4dff5217cf91f34e43b3 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 12 May 2022 20:38:37 -0700 Subject: [PATCH 54/65] tty: fix deadlock caused by calling printk() under tty_port->lock pty_write() invokes kmalloc() which may invoke a normal printk() to print a failure message.
This can cause a deadlock in the scenario reported by syz-bot below: CPU0 CPU1 CPU2 ---- ---- ---- lock(console_owner); lock(&port_lock_key); lock(&port->lock); lock(&port_lock_key); lock(&port->lock); lock(console_owner); As commit dbdda842fe96 ("printk: Add console owner and waiter logic to load balance console writes") said, such deadlock can be prevented by using printk_deferred() in kmalloc() (which is invoked in the section guarded by the port->lock). But there are too many printk() on the kmalloc() path, and kmalloc() can be called from anywhere, so changing printk() to printk_deferred() is too complicated and inelegant. Therefore, this patch chooses to specify __GFP_NOWARN to kmalloc(), so that printk() will not be called, and this deadlock problem can be avoided. Syzbot reported the following lockdep error: ====================================================== WARNING: possible circular locking dependency detected 5.4.143-00237-g08ccc19a-dirty #10 Not tainted ------------------------------------------------------ syz-executor.4/29420 is trying to acquire lock: ffffffff8aedb2a0 (console_owner){....}-{0:0}, at: console_trylock_spinning kernel/printk/printk.c:1752 [inline] ffffffff8aedb2a0 (console_owner){....}-{0:0}, at: vprintk_emit+0x2ca/0x470 kernel/printk/printk.c:2023 but task is already holding lock: ffff8880119c9158 (&port->lock){-.-.}-{2:2}, at: pty_write+0xf4/0x1f0 drivers/tty/pty.c:120 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (&port->lock){-.-.}-{2:2}: __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x35/0x50 kernel/locking/spinlock.c:159 tty_port_tty_get drivers/tty/tty_port.c:288 [inline] <-- lock(&port->lock); tty_port_default_wakeup+0x1d/0xb0 drivers/tty/tty_port.c:47 serial8250_tx_chars+0x530/0xa80 drivers/tty/serial/8250/8250_port.c:1767 serial8250_handle_irq.part.0+0x31f/0x3d0 drivers/tty/serial/8250/8250_port.c:1854 serial8250_handle_irq drivers/tty/serial/8250/8250_port.c:1827 [inline] <-- lock(&port_lock_key); serial8250_default_handle_irq+0xb2/0x220 drivers/tty/serial/8250/8250_port.c:1870 serial8250_interrupt+0xfd/0x200 drivers/tty/serial/8250/8250_core.c:126 __handle_irq_event_percpu+0x109/0xa50 kernel/irq/handle.c:156 [...] -> #1 (&port_lock_key){-.-.}-{2:2}: __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x35/0x50 kernel/locking/spinlock.c:159 serial8250_console_write+0x184/0xa40 drivers/tty/serial/8250/8250_port.c:3198 <-- lock(&port_lock_key); call_console_drivers kernel/printk/printk.c:1819 [inline] console_unlock+0x8cb/0xd00 kernel/printk/printk.c:2504 vprintk_emit+0x1b5/0x470 kernel/printk/printk.c:2024 <-- lock(console_owner); vprintk_func+0x8d/0x250 kernel/printk/printk_safe.c:394 printk+0xba/0xed kernel/printk/printk.c:2084 register_console+0x8b3/0xc10 kernel/printk/printk.c:2829 univ8250_console_init+0x3a/0x46 drivers/tty/serial/8250/8250_core.c:681 console_init+0x49d/0x6d3 kernel/printk/printk.c:2915 start_kernel+0x5e9/0x879 init/main.c:713 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241 -> #0 (console_owner){....}-{0:0}: [...] 
lock_acquire+0x127/0x340 kernel/locking/lockdep.c:4734 console_trylock_spinning kernel/printk/printk.c:1773 [inline] <-- lock(console_owner); vprintk_emit+0x307/0x470 kernel/printk/printk.c:2023 vprintk_func+0x8d/0x250 kernel/printk/printk_safe.c:394 printk+0xba/0xed kernel/printk/printk.c:2084 fail_dump lib/fault-inject.c:45 [inline] should_fail+0x67b/0x7c0 lib/fault-inject.c:144 __should_failslab+0x152/0x1c0 mm/failslab.c:33 should_failslab+0x5/0x10 mm/slab_common.c:1224 slab_pre_alloc_hook mm/slab.h:468 [inline] slab_alloc_node mm/slub.c:2723 [inline] slab_alloc mm/slub.c:2807 [inline] __kmalloc+0x72/0x300 mm/slub.c:3871 kmalloc include/linux/slab.h:582 [inline] tty_buffer_alloc+0x23f/0x2a0 drivers/tty/tty_buffer.c:175 __tty_buffer_request_room+0x156/0x2a0 drivers/tty/tty_buffer.c:273 tty_insert_flip_string_fixed_flag+0x93/0x250 drivers/tty/tty_buffer.c:318 tty_insert_flip_string include/linux/tty_flip.h:37 [inline] pty_write+0x126/0x1f0 drivers/tty/pty.c:122 <-- lock(&port->lock); n_tty_write+0xa7a/0xfc0 drivers/tty/n_tty.c:2356 do_tty_write drivers/tty/tty_io.c:961 [inline] tty_write+0x512/0x930 drivers/tty/tty_io.c:1045 __vfs_write+0x76/0x100 fs/read_write.c:494 [...] 
other info that might help us debug this: Chain exists of: console_owner --> &port_lock_key --> &port->lock Link: https://lkml.kernel.org/r/20220511061951.1114-2-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20220510113809.80626-2-zhengqi.arch@bytedance.com Fixes: b6da31b2c07c ("tty: Fix data race in tty_insert_flip_string_fixed_flag") Signed-off-by: Qi Zheng Acked-by: Jiri Slaby Acked-by: Greg Kroah-Hartman Cc: Akinobu Mita Cc: Vlastimil Babka Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- drivers/tty/tty_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index 646510476c30..bfa431a8e690 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -175,7 +175,8 @@ static struct tty_buffer *tty_buffer_alloc(struct tty_port *port, size_t size) */ if (atomic_read(&port->buf.mem_used) > port->buf.mem_limit) return NULL; - p = kmalloc(sizeof(struct tty_buffer) + 2 * size, GFP_ATOMIC); + p = kmalloc(sizeof(struct tty_buffer) + 2 * size, + GFP_ATOMIC | __GFP_NOWARN); if (p == NULL) return NULL; From 25d9767831d3dcae8f9f278555ba9ed57b30bbce Mon Sep 17 00:00:00 2001 From: Haowen Bai Date: Thu, 12 May 2022 20:38:37 -0700 Subject: [PATCH 55/65] ia64: mca: drop redundant spinlock initialization mlogbuf_rlock is already declared and initialized by DEFINE_SPINLOCK, so we don't need to call spin_lock_init() on it again; drop it.
Link: https://lkml.kernel.org/r/1652176897-4754-1-git-send-email-baihaowen@meizu.com Signed-off-by: Haowen Bai Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- arch/ia64/kernel/mca.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index e628a88607bb..c62a66710ad6 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -290,7 +290,6 @@ static void ia64_mlogbuf_finish(int wait) { BREAK_LOGLEVEL(console_loglevel); - spin_lock_init(&mlogbuf_rlock); ia64_mlogbuf_dump(); printk(KERN_EMERG "mlogbuf_finish: printing switched to urgent mode, " "MCA/INIT might be dodgy or fail.\n"); From c7031c144043c5b9a9b8827aaf44a67937559418 Mon Sep 17 00:00:00 2001 From: Julius Hemanth Pitti Date: Fri, 13 May 2022 16:58:15 -0700 Subject: [PATCH 56/65] proc/sysctl: make protected_* world readable protected_* files have 600 permissions which prevents non-superuser from reading them. Container like "AWS greengrass" refuse to launch unless protected_hardlinks and protected_symlinks are set. When containers like these run with "userns-remap" or "--user" mapping container's root to non-superuser on host, they fail to run due to denied read access to these files. As these protections are hardly a secret, and do not possess any security risk, making them world readable. Though above greengrass usecase needs read access to only protected_hardlinks and protected_symlinks files, setting all other protected_* files to 644 to keep consistency. 
Link: http://lkml.kernel.org/r/20200709235115.56954-1-jpitti@cisco.com Fixes: 800179c9b8a1 ("fs: add link restrictions") Signed-off-by: Julius Hemanth Pitti Acked-by: Kees Cook Acked-by: Luis Chamberlain Cc: Iurii Zaikin Cc: Ingo Molnar Cc: Al Viro Signed-off-by: Andrew Morton --- fs/namei.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 509657fdf4f5..5e05571bb941 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1031,7 +1031,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1040,7 +1040,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1049,7 +1049,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, @@ -1058,7 +1058,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, From 504ed164d7cd858d25ed5e3413928e1397f4c567 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 17 May 2022 22:33:20 +0800 Subject: [PATCH 57/65] MAINTAINERS: add Muchun as a memcg reviewer I have been focusing on mm for the past two years. e.g. developing, fixing bugs, reviewing. I have fixed lots of races (including memcg). I would like to help people working on memcg or related by reviewing their work. 
Let me be Cc'd on patches related to memcg. Link: https://lkml.kernel.org/r/20220517143320.99649-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Roman Gushchin Acked-by: FanJun Kong Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 886265f04061..5eede9e773a6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5023,6 +5023,7 @@ M: Johannes Weiner M: Michal Hocko M: Roman Gushchin M: Shakeel Butt +R: Muchun Song L: cgroups@vger.kernel.org L: linux-mm@kvack.org S: Maintained From 4dcc3f96e7439f9a3a6e47d7fc147aad1338ddc4 Mon Sep 17 00:00:00 2001 From: Chung-Chiang Cheng Date: Tue, 3 May 2022 23:25:33 +0800 Subject: [PATCH 58/65] fat: split fat_truncate_time() into separate functions Separate fat_truncate_time() to each timestamps for later creation time work. This patch does not introduce any functional changes, it's merely refactoring change. 
Link: https://lkml.kernel.org/r/20220503152536.2503003-1-cccheng@synology.com Signed-off-by: Chung-Chiang Cheng Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton --- fs/fat/fat.h | 6 +++++ fs/fat/misc.c | 74 ++++++++++++++++++++++++++++++++------------------- 2 files changed, 53 insertions(+), 27 deletions(-) diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 2cf85a6e0d99..b7f60b366030 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -453,6 +453,12 @@ extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs); extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs); +extern struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts); +extern struct timespec64 fat_truncate_crtime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts); +extern struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts); extern int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags); extern int fat_update_time(struct inode *inode, struct timespec64 *now, diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 855477d89f41..80c6f8b3dc75 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -193,7 +193,7 @@ static long days_in_year[] = { 0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, }; -static inline int fat_tz_offset(struct msdos_sb_info *sbi) +static inline int fat_tz_offset(const struct msdos_sb_info *sbi) { return (sbi->options.tz_set ? 
-sbi->options.time_offset : @@ -288,16 +288,49 @@ static inline struct timespec64 fat_timespec64_trunc_10ms(struct timespec64 ts) return ts; } +/* + * truncate atime to 24 hour granularity (00:00:00 in local timezone) + */ +struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts) +{ + /* to localtime */ + time64_t seconds = ts->tv_sec - fat_tz_offset(sbi); + s32 remainder; + + div_s64_rem(seconds, SECS_PER_DAY, &remainder); + /* to day boundary, and back to unix time */ + seconds = seconds + fat_tz_offset(sbi) - remainder; + + return (struct timespec64){ seconds, 0 }; +} + +/* + * truncate creation time with appropriate granularity: + * msdos - 2 seconds + * vfat - 10 milliseconds + */ +struct timespec64 fat_truncate_crtime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts) +{ + if (sbi->options.isvfat) + return fat_timespec64_trunc_10ms(*ts); + else + return fat_timespec64_trunc_2secs(*ts); +} + +/* + * truncate mtime to 2 second granularity + */ +struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts) +{ + return fat_timespec64_trunc_2secs(*ts); +} + /* * truncate the various times with appropriate granularity: - * root inode: - * all times always 0 - * all other inodes: - * mtime - 2 seconds - * ctime - * msdos - 2 seconds - * vfat - 10 milliseconds - * atime - 24 hours (00:00:00 in local timezone) + * all times in root node are always 0 */ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) { @@ -312,25 +345,12 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) ts = current_time(inode); } - if (flags & S_ATIME) { - /* to localtime */ - time64_t seconds = now->tv_sec - fat_tz_offset(sbi); - s32 remainder; - - div_s64_rem(seconds, SECS_PER_DAY, &remainder); - /* to day boundary, and back to unix time */ - seconds = seconds + fat_tz_offset(sbi) - remainder; - - inode->i_atime = (struct timespec64){ seconds, 0 
}; - } - if (flags & S_CTIME) { - if (sbi->options.isvfat) - inode->i_ctime = fat_timespec64_trunc_10ms(*now); - else - inode->i_ctime = fat_timespec64_trunc_2secs(*now); - } + if (flags & S_ATIME) + inode->i_atime = fat_truncate_atime(sbi, now); + if (flags & S_CTIME) + inode->i_ctime = fat_truncate_crtime(sbi, now); if (flags & S_MTIME) - inode->i_mtime = fat_timespec64_trunc_2secs(*now); + inode->i_mtime = fat_truncate_mtime(sbi, now); return 0; } From 0f9d148167c53a7029aba29cdc45072027033b72 Mon Sep 17 00:00:00 2001 From: Chung-Chiang Cheng Date: Tue, 3 May 2022 23:25:34 +0800 Subject: [PATCH 59/65] fat: ignore ctime updates, and keep ctime identical to mtime in memory FAT supports creation time but not change time, and there was no corresponding timestamp for creation time in previous VFS. The original implementation took the compromise of saving the in-memory change time into the on-disk creation time field, but this would lead to compatibility issues with non-linux systems. To address this issue, this patch changes the behavior of ctime. It will no longer be loaded and stored from the creation time on disk. Instead of that, it'll be consistent with the in-memory mtime and share the same on-disk field. All updates to mtime will also be applied to ctime in memory, while all updates to ctime will be ignored. 
Link: https://lkml.kernel.org/r/20220503152536.2503003-2-cccheng@synology.com Signed-off-by: Chung-Chiang Cheng Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton --- fs/fat/inode.c | 11 ++++------- fs/fat/misc.c | 9 ++++++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index cb698a827c9a..d0371913f496 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -567,12 +567,11 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) & ~((loff_t)sbi->cluster_size - 1)) >> 9; fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); - if (sbi->options.isvfat) { - fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime, - de->cdate, de->ctime_cs); + inode->i_ctime = inode->i_mtime; + if (sbi->options.isvfat) fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); - } else - fat_truncate_time(inode, &inode->i_mtime, S_ATIME|S_CTIME); + else + inode->i_atime = fat_truncate_atime(sbi, &inode->i_mtime); return 0; } @@ -888,8 +887,6 @@ retry: &raw_entry->date, NULL); if (sbi->options.isvfat) { __le16 atime; - fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime, - &raw_entry->cdate, &raw_entry->ctime_cs); fat_time_unix2fat(sbi, &inode->i_atime, &atime, &raw_entry->adate, NULL); } diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 80c6f8b3dc75..8ebe49e315ab 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -347,10 +347,13 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) if (flags & S_ATIME) inode->i_atime = fat_truncate_atime(sbi, now); - if (flags & S_CTIME) - inode->i_ctime = fat_truncate_crtime(sbi, now); + /* + * ctime and mtime share the same on-disk field, and should be + * identical in memory. all mtime updates will be applied to ctime, + * but ctime updates are ignored. 
+ */ if (flags & S_MTIME) - inode->i_mtime = fat_truncate_mtime(sbi, now); + inode->i_mtime = inode->i_ctime = fat_truncate_mtime(sbi, now); return 0; } From 30abce053f811f52688a5b739c3e4ba98d34070d Mon Sep 17 00:00:00 2001 From: Chung-Chiang Cheng Date: Tue, 3 May 2022 23:25:35 +0800 Subject: [PATCH 60/65] fat: report creation time in statx creation time is no longer mixed with change time. Add an in-memory field for it, and report it in statx if supported. Link: https://lkml.kernel.org/r/20220503152536.2503003-3-cccheng@synology.com Signed-off-by: Chung-Chiang Cheng Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton --- fs/fat/fat.h | 1 + fs/fat/file.c | 16 ++++++++++++---- fs/fat/inode.c | 10 ++++++++-- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/fs/fat/fat.h b/fs/fat/fat.h index b7f60b366030..6ed05ac0e694 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -126,6 +126,7 @@ struct msdos_inode_info { struct hlist_node i_fat_hash; /* hash by i_location */ struct hlist_node i_dir_hash; /* hash by i_logstart */ struct rw_semaphore truncate_lock; /* protect bmap against truncate */ + struct timespec64 i_crtime; /* File creation (birth) time */ struct inode vfs_inode; }; diff --git a/fs/fat/file.c b/fs/fat/file.c index a5a309fcc7fa..8f5218450a3a 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -399,13 +399,21 @@ int fat_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(mnt_userns, inode, stat); - stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) { + generic_fillattr(mnt_userns, inode, stat); + stat->blksize = sbi->cluster_size; + + if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) { /* Use i_pos for ino. This is used as fileid of nfs. 
*/ - stat->ino = fat_i_pos_read(MSDOS_SB(inode->i_sb), inode); + stat->ino = fat_i_pos_read(sbi, inode); } + + if (sbi->options.isvfat && request_mask & STATX_BTIME) { + stat->result_mask |= STATX_BTIME; + stat->btime = MSDOS_I(inode)->i_crtime; + } + return 0; } EXPORT_SYMBOL_GPL(fat_getattr); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index d0371913f496..32e721b88a49 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -568,9 +568,11 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); inode->i_ctime = inode->i_mtime; - if (sbi->options.isvfat) + if (sbi->options.isvfat) { fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); - else + fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime, + de->cdate, de->ctime_cs); + } else inode->i_atime = fat_truncate_atime(sbi, &inode->i_mtime); return 0; @@ -756,6 +758,8 @@ static struct inode *fat_alloc_inode(struct super_block *sb) ei->i_logstart = 0; ei->i_attrs = 0; ei->i_pos = 0; + ei->i_crtime.tv_sec = 0; + ei->i_crtime.tv_nsec = 0; return &ei->vfs_inode; } @@ -889,6 +893,8 @@ retry: __le16 atime; fat_time_unix2fat(sbi, &inode->i_atime, &atime, &raw_entry->adate, NULL); + fat_time_unix2fat(sbi, &MSDOS_I(inode)->i_crtime, &raw_entry->ctime, + &raw_entry->cdate, &raw_entry->ctime_cs); } spin_unlock(&sbi->inode_hash_lock); mark_buffer_dirty(bh); From 1213375077451337c2381a8b5b88502a3fc394a2 Mon Sep 17 00:00:00 2001 From: Chung-Chiang Cheng Date: Tue, 3 May 2022 23:25:36 +0800 Subject: [PATCH 61/65] fat: remove time truncations in vfat_create/vfat_mkdir All the timestamps in vfat_create() and vfat_mkdir() come from fat_time_fat2unix() which ensures time granularity. We don't need to truncate them to fit FAT's format. Moreover, fat_truncate_crtime() and fat_timespec64_trunc_10ms() are also removed because there is no caller anymore. 
Link: https://lkml.kernel.org/r/20220503152536.2503003-4-cccheng@synology.com Signed-off-by: Chung-Chiang Cheng Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton --- fs/fat/fat.h | 2 -- fs/fat/misc.c | 21 --------------------- fs/fat/namei_vfat.c | 4 ---- 3 files changed, 27 deletions(-) diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 6ed05ac0e694..a415c02ede39 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -456,8 +456,6 @@ extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs); extern struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi, const struct timespec64 *ts); -extern struct timespec64 fat_truncate_crtime(const struct msdos_sb_info *sbi, - const struct timespec64 *ts); extern struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi, const struct timespec64 *ts); extern int fat_truncate_time(struct inode *inode, struct timespec64 *now, diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 8ebe49e315ab..7e5d6ae305f2 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -281,13 +281,6 @@ static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts) return (struct timespec64){ ts.tv_sec & ~1ULL, 0 }; } -static inline struct timespec64 fat_timespec64_trunc_10ms(struct timespec64 ts) -{ - if (ts.tv_nsec) - ts.tv_nsec -= ts.tv_nsec % 10000000UL; - return ts; -} - /* * truncate atime to 24 hour granularity (00:00:00 in local timezone) */ @@ -305,20 +298,6 @@ struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi, return (struct timespec64){ seconds, 0 }; } -/* - * truncate creation time with appropriate granularity: - * msdos - 2 seconds - * vfat - 10 milliseconds - */ -struct timespec64 fat_truncate_crtime(const struct msdos_sb_info *sbi, - const struct timespec64 *ts) -{ - if (sbi->options.isvfat) - return fat_timespec64_trunc_10ms(*ts); - else - return fat_timespec64_trunc_2secs(*ts); -} - /* * truncate mtime to 2 second granularity */ diff 
--git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 5369d82e0bfb..c573314806cf 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -780,8 +780,6 @@ static int vfat_create(struct user_namespace *mnt_userns, struct inode *dir, goto out; } inode_inc_iversion(inode); - fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); - /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); out: @@ -878,8 +876,6 @@ static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, } inode_inc_iversion(inode); set_nlink(inode, 2); - fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); - /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); From 69bc169ec33f101fe6d296976aaa34f51edbaa9e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 17 May 2022 10:36:46 +0100 Subject: [PATCH 62/65] fs/ntfs: remove redundant variable idx The variable idx is assigned a value and is never read. The variable is not used and is redundant, remove it. 
Cleans up clang scan build warning: warning: Although the value stored to 'idx' is used in the enclosing expression, the value is never actually read from 'idx' [deadcode.DeadStores] Link: https://lkml.kernel.org/r/20220517093646.93628-2-colin.i.king@gmail.com Signed-off-by: Colin Ian King Reviewed-by: Anton Altaparmakov Signed-off-by: Andrew Morton --- fs/ntfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 2ae25e48a41a..329fca1fa619 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1772,11 +1772,11 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, last_vcn = -1; do { VCN vcn; - pgoff_t idx, start_idx; + pgoff_t start_idx; unsigned ofs, do_pages, u; size_t copied; - start_idx = idx = pos >> PAGE_SHIFT; + start_idx = pos >> PAGE_SHIFT; ofs = pos & ~PAGE_MASK; bytes = PAGE_SIZE - ofs; do_pages = 1; From 0b6d14e3dbde9de158263ce5e4a27112693e71ac Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Wed, 18 May 2022 16:52:23 -0700 Subject: [PATCH 63/65] ocfs2: dlmfs: don't clear USER_LOCK_ATTACHED when destroying lock The following function is the only place that checks USER_LOCK_ATTACHED. This flag is set when lock request is granted through user_ast() and only the following function will clear it. Checking of this flag here is to make sure ocfs2_dlm_unlock is not issued if this lock is never granted. For example, lock file is created and then get removed, open file never happens. Clearing the flag here is not necessary because this is the only function that checks it, if another flow is executing user_dlm_destroy_lock(), it will bail out at the beginning because of USER_LOCK_IN_TEARDOWN and never check USER_LOCK_ATTACHED. Drop the clear, so we don't need take care of it for the following error handling patch. int user_dlm_destroy_lock(struct user_lock_res *lockres) { ... 
status = 0; if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { spin_unlock(&lockres->l_lock); goto bail; } lockres->l_flags &= ~USER_LOCK_ATTACHED; lockres->l_flags |= USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK); if (status) { user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); goto bail; } ... } V1 discussion with Joseph: https://lore.kernel.org/all/7b620c53-0c45-da2c-829e-26195cbe7d4e@linux.alibaba.com/T/ Link: https://lkml.kernel.org/r/20220518235224.87100-1-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dlmfs/userdlm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index 29f183a15798..af0be612589c 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -619,7 +619,6 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) goto bail; } - lockres->l_flags &= ~USER_LOCK_ATTACHED; lockres->l_flags |= USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); From 863e0d81b6683c4cbc588ad831f560c90e494bef Mon Sep 17 00:00:00 2001 From: Junxiao Bi via Ocfs2-devel Date: Wed, 18 May 2022 16:52:24 -0700 Subject: [PATCH 64/65] ocfs2: dlmfs: fix error handling of user_dlm_destroy_lock When user_dlm_destroy_lock failed, it didn't clean up the flags it set before exit. For USER_LOCK_IN_TEARDOWN, if this function fails because the lock is still in use, the next time unlink invokes this function it will return success, and then unlink will remove the inode and dentry if the lock is not in use (file closed), but the dlm lock is still linked in the dlm lock resource; then when a bast comes in, it will trigger a panic due to use-after-free. See the following panic call trace. To fix this, USER_LOCK_IN_TEARDOWN should be reverted on failure.
An error should also be returned if USER_LOCK_IN_TEARDOWN is already set, to let the user know that unlink failed. For the case of ocfs2_dlm_unlock failure, besides USER_LOCK_IN_TEARDOWN, USER_LOCK_BUSY also needs to be cleared. Even though the spin lock is released in between, USER_LOCK_IN_TEARDOWN is still set; so for USER_LOCK_BUSY, if USER_LOCK_IN_TEARDOWN is checked (to bail out) before every place that waits on this flag, no flow will wait on the busy flag set by user_dlm_destroy_lock(), and we can simply revert USER_LOCK_BUSY when ocfs2_dlm_unlock fails. Fix user_dlm_cluster_lock(), which is the only function not following this. [ 941.336392] (python,26174,16):dlmfs_unlink:562 ERROR: unlink 004fb0000060000b5a90b8c847b72e1, error -16 from destroy [ 989.757536] ------------[ cut here ]------------ [ 989.757709] kernel BUG at fs/ocfs2/dlmfs/userdlm.c:173! [ 989.757876] invalid opcode: 0000 [#1] SMP [ 989.758027] Modules linked in: ksplice_2zhuk2jr_ib_ipoib_new(O) ksplice_2zhuk2jr(O) mptctl mptbase xen_netback xen_blkback xen_gntalloc xen_gntdev xen_evtchn cdc_ether usbnet mii ocfs2 jbd2 rpcsec_gss_krb5 auth_rpcgss nfsv4 nfsv3 nfs_acl nfs fscache lockd grace ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue configfs bnx2fc fcoe libfcoe libfc scsi_transport_fc sunrpc ipmi_devintf bridge stp llc rds_rdma rds bonding ib_sdp ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm falcon_lsm_serviceable(PE) falcon_nf_netcontain(PE) mlx4_vnic falcon_kal(E) falcon_lsm_pinned_13402(E) mlx4_ib ib_sa ib_mad ib_core ib_addr xenfs xen_privcmd dm_multipath iTCO_wdt iTCO_vendor_support pcspkr sb_edac edac_core i2c_i801 lpc_ich mfd_core ipmi_ssif i2c_core ipmi_si ipmi_msghandler [ 989.760686] ioatdma sg ext3 jbd mbcache sd_mod ahci libahci ixgbe dca ptp pps_core vxlan udp_tunnel ip6_udp_tunnel megaraid_sas mlx4_core crc32c_intel be2iscsi bnx2i cnic uio cxgb4i cxgb4 cxgb3i libcxgbi ipv6 cxgb3 mdio libiscsi_tcp qla4xxx iscsi_boot_sysfs
libiscsi scsi_transport_iscsi wmi dm_mirror dm_region_hash dm_log dm_mod [last unloaded: ksplice_2zhuk2jr_ib_ipoib_old] [ 989.761987] CPU: 10 PID: 19102 Comm: dlm_thread Tainted: P OE 4.1.12-124.57.1.el6uek.x86_64 #2 [ 989.762290] Hardware name: Oracle Corporation ORACLE SERVER X5-2/ASM,MOTHERBOARD,1U, BIOS 30350100 06/17/2021 [ 989.762599] task: ffff880178af6200 ti: ffff88017f7c8000 task.ti: ffff88017f7c8000 [ 989.762848] RIP: e030:[] [] __user_dlm_queue_lockres.part.4+0x76/0x80 [ocfs2_dlmfs] [ 989.763185] RSP: e02b:ffff88017f7cbcb8 EFLAGS: 00010246 [ 989.763353] RAX: 0000000000000000 RBX: ffff880174d48008 RCX: 0000000000000003 [ 989.763565] RDX: 0000000000120012 RSI: 0000000000000003 RDI: ffff880174d48170 [ 989.763778] RBP: ffff88017f7cbcc8 R08: ffff88021f4293b0 R09: 0000000000000000 [ 989.763991] R10: ffff880179c8c000 R11: 0000000000000003 R12: ffff880174d48008 [ 989.764204] R13: 0000000000000003 R14: ffff880179c8c000 R15: ffff88021db7a000 [ 989.764422] FS: 0000000000000000(0000) GS:ffff880247480000(0000) knlGS:ffff880247480000 [ 989.764685] CS: e033 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 989.764865] CR2: ffff8000007f6800 CR3: 0000000001ae0000 CR4: 0000000000042660 [ 989.765081] Stack: [ 989.765167] 0000000000000003 ffff880174d48040 ffff88017f7cbd18 ffffffffc07d455f [ 989.765442] ffff88017f7cbd88 ffffffff816fb639 ffff88017f7cbd38 ffff8800361b5600 [ 989.765717] ffff88021db7a000 ffff88021f429380 0000000000000003 ffffffffc0453020 [ 989.765991] Call Trace: [ 989.766093] [] user_bast+0x5f/0xf0 [ocfs2_dlmfs] [ 989.766287] [] ? schedule_timeout+0x169/0x2d0 [ 989.766475] [] ? o2dlm_lock_ast_wrapper+0x20/0x20 [ocfs2_stack_o2cb] [ 989.766738] [] o2dlm_blocking_ast_wrapper+0x1a/0x20 [ocfs2_stack_o2cb] [ 989.767010] [] dlm_do_local_bast+0x46/0xe0 [ocfs2_dlm] [ 989.767217] [] ? dlm_lockres_calc_usage+0x4c/0x60 [ocfs2_dlm] [ 989.767466] [] dlm_thread+0xa31/0x1140 [ocfs2_dlm] [ 989.767662] [] ? __schedule+0x24a/0x810 [ 989.767834] [] ? 
__schedule+0x23e/0x810 [ 989.768006] [] ? __schedule+0x24a/0x810 [ 989.768178] [] ? __schedule+0x23e/0x810 [ 989.768349] [] ? __schedule+0x24a/0x810 [ 989.768521] [] ? __schedule+0x23e/0x810 [ 989.768693] [] ? __schedule+0x24a/0x810 [ 989.768893] [] ? __schedule+0x23e/0x810 [ 989.769067] [] ? __schedule+0x24a/0x810 [ 989.769241] [] ? wait_woken+0x90/0x90 [ 989.769411] [] ? dlm_kick_thread+0x80/0x80 [ocfs2_dlm] [ 989.769617] [] kthread+0xcb/0xf0 [ 989.769774] [] ? __schedule+0x24a/0x810 [ 989.769945] [] ? __schedule+0x24a/0x810 [ 989.770117] [] ? kthread_create_on_node+0x180/0x180 [ 989.770321] [] ret_from_fork+0x61/0x90 [ 989.770492] [] ? kthread_create_on_node+0x180/0x180 [ 989.770689] Code: d0 00 00 00 f0 45 7d c0 bf 00 20 00 00 48 89 83 c0 00 00 00 48 89 83 c8 00 00 00 e8 55 c1 8c c0 83 4b 04 10 48 83 c4 08 5b 5d c3 <0f> 0b 0f 1f 84 00 00 00 00 00 55 48 89 e5 41 55 41 54 53 48 83 [ 989.771892] RIP [] __user_dlm_queue_lockres.part.4+0x76/0x80 [ocfs2_dlmfs] [ 989.772174] RSP [ 989.772704] ---[ end trace ebd1e38cebcc93a8 ]--- [ 989.772907] Kernel panic - not syncing: Fatal exception [ 989.773173] Kernel Offset: disabled Link: https://lkml.kernel.org/r/20220518235224.87100-2-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/dlmfs/userdlm.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index af0be612589c..617c92e7b925 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -433,6 +433,11 @@ again: } spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + spin_unlock(&lockres->l_lock); + status = -EAGAIN; + goto bail; + } /* We only compare against the currently granted level * here. 
If the lock is blocked waiting on a downconvert, @@ -595,7 +600,7 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { spin_unlock(&lockres->l_lock); - return 0; + goto bail; } lockres->l_flags |= USER_LOCK_IN_TEARDOWN; @@ -609,12 +614,17 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) } if (lockres->l_ro_holders || lockres->l_ex_holders) { + lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; spin_unlock(&lockres->l_lock); goto bail; } status = 0; if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { + /* + * lock is never requested, leave USER_LOCK_IN_TEARDOWN set + * to avoid new lock request coming in. + */ spin_unlock(&lockres->l_lock); goto bail; } @@ -624,6 +634,10 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK); if (status) { + spin_lock(&lockres->l_lock); + lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; + lockres->l_flags &= ~USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); goto bail; } From 3159d79b56c15068aeb7e4630cd5f6dacd20fda4 Mon Sep 17 00:00:00 2001 From: Congyu Liu Date: Mon, 23 May 2022 05:35:31 +0000 Subject: [PATCH 65/65] kcov: update pos before writing pc in trace function In __sanitizer_cov_trace_pc(), previously we wrote pc before updating pos. However, some early interrupt code could bypass check_kcov_mode() check and invoke __sanitizer_cov_trace_pc(). If such interrupt is raised between writing pc and updating pos, the pc could be overwritten by the recursive __sanitizer_cov_trace_pc(). As suggested by Dmitry, we could update pos before writing pc to avoid such interleaving. Apply the same change to write_comp_data(). 
Link: https://lkml.kernel.org/r/20220523053531.1572793-1-liu3101@purdue.edu Signed-off-by: Congyu Liu Reviewed-by: Dmitry Vyukov Cc: Andrey Konovalov Signed-off-by: Andrew Morton --- kernel/kcov.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index b3732b210593..e19c84b02452 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -204,8 +204,16 @@ void notrace __sanitizer_cov_trace_pc(void) /* The first 64-bit word is the number of subsequent PCs. */ pos = READ_ONCE(area[0]) + 1; if (likely(pos < t->kcov_size)) { - area[pos] = ip; + /* Previously we write pc before updating pos. However, some + * early interrupt code could bypass check_kcov_mode() check + * and invoke __sanitizer_cov_trace_pc(). If such interrupt is + * raised between writing pc and updating pos, the pc could be + * overwritten by the recursive __sanitizer_cov_trace_pc(). + * Update pos before writing pc to avoid such interleaving. + */ WRITE_ONCE(area[0], pos); + barrier(); + area[pos] = ip; } } EXPORT_SYMBOL(__sanitizer_cov_trace_pc); @@ -236,11 +244,13 @@ static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) start_index = 1 + count * KCOV_WORDS_PER_CMP; end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64); if (likely(end_pos <= max_pos)) { + /* See comment in __sanitizer_cov_trace_pc(). */ + WRITE_ONCE(area[0], count + 1); + barrier(); area[start_index] = type; area[start_index + 1] = arg1; area[start_index + 2] = arg2; area[start_index + 3] = ip; - WRITE_ONCE(area[0], count + 1); } }