linux-stable/fs/smb/client/cifsfs.c

2019 lines
55 KiB
C
Raw Normal View History

// SPDX-License-Identifier: LGPL-2.1
/*
*
* Copyright (C) International Business Machines Corp., 2002,2008
* Author(s): Steve French (sfrench@us.ibm.com)
*
* Common Internet FileSystem (CIFS) client
*
*/
/* Note that BB means BUGBUG (ie something to fix eventually) */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/mount.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/seq_file.h>
#include <linux/vfs.h>
#include <linux/mempool.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/namei.h>
#include <linux/random.h>
#include <linux/splice.h>
#include <linux/uuid.h>
#include <linux/xattr.h>
#include <uapi/linux/magic.h>
#include <net/ipv6.h>
#include "cifsfs.h"
#include "cifspdu.h"
#define DECLARE_GLOBALS_HERE
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
#include <linux/mm.h>
#include <linux/key-type.h>
#include "cifs_spnego.h"
#include "fscache.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
#include "dfs_cache.h"
#endif
#ifdef CONFIG_CIFS_SWN_UPCALL
#include "netlink.h"
#endif
#include "fs_context.h"
#include "cached_dir.h"
/*
* DOS dates from 1980/1/1 through 2107/12/31
* Protocol specifications indicate the range should be to 119, which
* limits maximum year to 2099. But this range has not been checked.
*/
#define SMB_DATE_MAX (127<<9 | 12<<5 | 31)
#define SMB_DATE_MIN (0<<9 | 1<<5 | 1)
#define SMB_TIME_MAX (23<<11 | 59<<5 | 29)
int cifsFYI = 0;
bool traceSMB;
bool enable_oplocks = true;
bool linuxExtEnabled = true;
bool lookupCacheEnabled = true;
bool disable_legacy_dialects; /* false by default */
bool enable_gcm_256 = true;
bool require_gcm_256; /* false by default */
bool enable_negotiate_signing; /* false by default */
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
unsigned int sign_CIFS_PDUs = 1;
/*
* Global transaction id (XID) information
*/
unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */
unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */
spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */
/*
* Global counters, updated atomically
*/
atomic_t sesInfoAllocCount;
atomic_t tconInfoAllocCount;
atomic_t tcpSesNextId;
atomic_t tcpSesAllocCount;
atomic_t tcpSesReconnectCount;
atomic_t tconInfoReconnectCount;
atomic_t mid_count;
atomic_t buf_alloc_count;
atomic_t small_buf_alloc_count;
#ifdef CONFIG_CIFS_STATS2
atomic_t total_buf_alloc_count;
atomic_t total_small_buf_alloc_count;
#endif/* STATS2 */
struct list_head cifs_tcp_ses_list;
spinlock_t cifs_tcp_ses_lock;
static const struct super_operations cifs_super_ops;
unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
module_param(CIFSMaxBufSize, uint, 0444);
MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header) "
"for CIFS requests. "
"Default: 16384 Range: 8192 to 130048");
unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL;
module_param(cifs_min_rcv, uint, 0444);
MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: "
"1 to 64");
unsigned int cifs_min_small = 30;
module_param(cifs_min_small, uint, 0444);
MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
"Range: 2 to 256");
unsigned int cifs_max_pending = CIFS_MAX_REQ;
module_param(cifs_max_pending, uint, 0444);
MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for "
"CIFS/SMB1 dialect (N/A for SMB3) "
"Default: 32767 Range: 2 to 32767.");
unsigned int dir_cache_timeout = 30;
module_param(dir_cache_timeout, uint, 0644);
MODULE_PARM_DESC(dir_cache_timeout, "Number of seconds to cache directory contents for which we have a lease. Default: 30 "
"Range: 1 to 65000 seconds, 0 to disable caching dir contents");
#ifdef CONFIG_CIFS_STATS2
unsigned int slow_rsp_threshold = 1;
module_param(slow_rsp_threshold, uint, 0644);
MODULE_PARM_DESC(slow_rsp_threshold, "Amount of time (in seconds) to wait "
"before logging that a response is delayed. "
"Default: 1 (if set to 0 disables msg).");
#endif /* STATS2 */
module_param(enable_oplocks, bool, 0644);
MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
module_param(enable_gcm_256, bool, 0644);
MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: n/N/0");
module_param(require_gcm_256, bool, 0644);
MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0");
module_param(enable_negotiate_signing, bool, 0644);
MODULE_PARM_DESC(enable_negotiate_signing, "Enable negotiating packet signing algorithm with server. Default: n/N/0");
module_param(disable_legacy_dialects, bool, 0644);
MODULE_PARM_DESC(disable_legacy_dialects, "To improve security it may be "
"helpful to restrict the ability to "
"override the default dialects (SMB2.1, "
"SMB3 and SMB3.02) on mount with old "
"dialects (CIFS/SMB1 and SMB2) since "
"vers=1.0 (CIFS/SMB1) and vers=2.0 are weaker"
" and less secure. Default: n/N/0");
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
struct workqueue_struct *cifsiod_wq;
struct workqueue_struct *decrypt_wq;
cifs: move cifsFileInfo_put logic into a work-queue This patch moves the final part of the cifsFileInfo_put() logic where we need a write lock on lock_sem to be processed in a separate thread that holds no other locks. This is to prevent deadlocks like the one below: > there are 6 processes looping to while trying to down_write > cinode->lock_sem, 5 of them from _cifsFileInfo_put, and one from > cifs_new_fileinfo > > and there are 5 other processes which are blocked, several of them > waiting on either PG_writeback or PG_locked (which are both set), all > for the same page of the file > > 2 inode_lock() (inode->i_rwsem) for the file > 1 wait_on_page_writeback() for the page > 1 down_read(inode->i_rwsem) for the inode of the directory > 1 inode_lock()(inode->i_rwsem) for the inode of the directory > 1 __lock_page > > > so processes are blocked waiting on: > page flags PG_locked and PG_writeback for one specific page > inode->i_rwsem for the directory > inode->i_rwsem for the file > cifsInodeInflock_sem > > > > here are the more gory details (let me know if I need to provide > anything more/better): > > [0 00:48:22.765] [UN] PID: 8863 TASK: ffff8c691547c5c0 CPU: 3 > COMMAND: "reopen_file" > #0 [ffff9965007e3ba8] __schedule at ffffffff9b6e6095 > #1 [ffff9965007e3c38] schedule at ffffffff9b6e64df > #2 [ffff9965007e3c48] rwsem_down_write_slowpath at ffffffff9af283d7 > #3 [ffff9965007e3cb8] legitimize_path at ffffffff9b0f975d > #4 [ffff9965007e3d08] path_openat at ffffffff9b0fe55d > #5 [ffff9965007e3dd8] do_filp_open at ffffffff9b100a33 > #6 [ffff9965007e3ee0] do_sys_open at ffffffff9b0eb2d6 > #7 [ffff9965007e3f38] do_syscall_64 at ffffffff9ae04315 > * (I think legitimize_path is bogus) > > in path_openat > } else { > const char *s = path_init(nd, flags); > while (!(error = link_path_walk(s, nd)) && > (error = do_last(nd, file, op)) > 0) { <<<< > > do_last: > if (open_flag & O_CREAT) > inode_lock(dir->d_inode); <<<< > else > so it's trying to take inode->i_rwsem for the directory > > DENTRY INODE SUPERBLK TYPE PATH > ffff8c68bb8e79c0 ffff8c691158ef20 ffff8c6915bf9000 DIR /mnt/vm1_smb/ > inode.i_rwsem is ffff8c691158efc0 > > <struct rw_semaphore 0xffff8c691158efc0>: > owner: <struct task_struct 0xffff8c6914275d00> (UN - 8856 - > reopen_file), counter: 0x0000000000000003 > waitlist: 2 > 0xffff9965007e3c90 8863 reopen_file UN 0 1:29:22.926 > RWSEM_WAITING_FOR_WRITE > 0xffff996500393e00 9802 ls UN 0 1:17:26.700 > RWSEM_WAITING_FOR_READ > > > the owner of the inode.i_rwsem of the directory is: > > [0 00:00:00.109] [UN] PID: 8856 TASK: ffff8c6914275d00 CPU: 3 > COMMAND: "reopen_file" > #0 [ffff99650065b828] __schedule at ffffffff9b6e6095 > #1 [ffff99650065b8b8] schedule at ffffffff9b6e64df > #2 [ffff99650065b8c8] schedule_timeout at ffffffff9b6e9f89 > #3 [ffff99650065b940] msleep at ffffffff9af573a9 > #4 [ffff99650065b948] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs] > #5 [ffff99650065ba38] cifs_writepage_locked at ffffffffc0a0b8f3 [cifs] > #6 [ffff99650065bab0] cifs_launder_page at ffffffffc0a0bb72 [cifs] > #7 [ffff99650065bb30] invalidate_inode_pages2_range at ffffffff9b04d4bd > #8 [ffff99650065bcb8] cifs_invalidate_mapping at ffffffffc0a11339 [cifs] > #9 [ffff99650065bcd0] cifs_revalidate_mapping at ffffffffc0a1139a [cifs] > #10 [ffff99650065bcf0] cifs_d_revalidate at ffffffffc0a014f6 [cifs] > #11 [ffff99650065bd08] path_openat at ffffffff9b0fe7f7 > #12 [ffff99650065bdd8] do_filp_open at ffffffff9b100a33 > #13 [ffff99650065bee0] do_sys_open at ffffffff9b0eb2d6 > #14 [ffff99650065bf38] do_syscall_64 at ffffffff9ae04315 > > cifs_launder_page is for page 0xffffd1e2c07d2480 > > crash> page.index,mapping,flags 0xffffd1e2c07d2480 > index = 0x8 > mapping = 0xffff8c68f3cd0db0 > flags = 0xfffffc0008095 > > PAGE-FLAG BIT VALUE > PG_locked 0 0000001 > PG_uptodate 2 0000004 > PG_lru 4 0000010 > PG_waiters 7 0000080 > PG_writeback 15 0008000 > > > inode is ffff8c68f3cd0c40 > inode.i_rwsem is ffff8c68f3cd0ce0 > DENTRY INODE SUPERBLK TYPE PATH > ffff8c68a1f1b480 ffff8c68f3cd0c40 ffff8c6915bf9000 REG > /mnt/vm1_smb/testfile.8853 > > > this process holds the inode->i_rwsem for the parent directory, is > laundering a page attached to the inode of the file it's opening, and in > _cifsFileInfo_put is trying to down_write the cifsInodeInflock_sem > for the file itself. > > > <struct rw_semaphore 0xffff8c68f3cd0ce0>: > owner: <struct task_struct 0xffff8c6914272e80> (UN - 8854 - > reopen_file), counter: 0x0000000000000003 > waitlist: 1 > 0xffff9965005dfd80 8855 reopen_file UN 0 1:29:22.912 > RWSEM_WAITING_FOR_WRITE > > this is the inode.i_rwsem for the file > > the owner: > > [0 00:48:22.739] [UN] PID: 8854 TASK: ffff8c6914272e80 CPU: 2 > COMMAND: "reopen_file" > #0 [ffff99650054fb38] __schedule at ffffffff9b6e6095 > #1 [ffff99650054fbc8] schedule at ffffffff9b6e64df > #2 [ffff99650054fbd8] io_schedule at ffffffff9b6e68e2 > #3 [ffff99650054fbe8] __lock_page at ffffffff9b03c56f > #4 [ffff99650054fc80] pagecache_get_page at ffffffff9b03dcdf > #5 [ffff99650054fcc0] grab_cache_page_write_begin at ffffffff9b03ef4c > #6 [ffff99650054fcd0] cifs_write_begin at ffffffffc0a064ec [cifs] > #7 [ffff99650054fd30] generic_perform_write at ffffffff9b03bba4 > #8 [ffff99650054fda8] __generic_file_write_iter at ffffffff9b04060a > #9 [ffff99650054fdf0] cifs_strict_writev.cold.70 at ffffffffc0a4469b [cifs] > #10 [ffff99650054fe48] new_sync_write at ffffffff9b0ec1dd > #11 [ffff99650054fed0] vfs_write at ffffffff9b0eed35 > #12 [ffff99650054ff00] ksys_write at ffffffff9b0eefd9 > #13 [ffff99650054ff38] do_syscall_64 at ffffffff9ae04315 > > the process holds the inode->i_rwsem for the file to which it's writing, > and is trying to __lock_page for the same page as in the other processes > > > the other tasks: > [0 00:00:00.028] [UN] PID: 8859 TASK: ffff8c6915479740 CPU: 2 > COMMAND: "reopen_file" > #0 [ffff9965007b39d8] __schedule at ffffffff9b6e6095 > #1 [ffff9965007b3a68] schedule at ffffffff9b6e64df > #2 [ffff9965007b3a78] schedule_timeout at ffffffff9b6e9f89 > #3 [ffff9965007b3af0] msleep at ffffffff9af573a9 > #4 [ffff9965007b3af8] cifs_new_fileinfo.cold.61 at ffffffffc0a42a07 [cifs] > #5 [ffff9965007b3b78] cifs_open at ffffffffc0a0709d [cifs] > #6 [ffff9965007b3cd8] do_dentry_open at ffffffff9b0e9b7a > #7 [ffff9965007b3d08] path_openat at ffffffff9b0fe34f > #8 [ffff9965007b3dd8] do_filp_open at ffffffff9b100a33 > #9 [ffff9965007b3ee0] do_sys_open at ffffffff9b0eb2d6 > #10 [ffff9965007b3f38] do_syscall_64 at ffffffff9ae04315 > > this is opening the file, and is trying to down_write cinode->lock_sem > > > [0 00:00:00.041] [UN] PID: 8860 TASK: ffff8c691547ae80 CPU: 2 > COMMAND: "reopen_file" > [0 00:00:00.057] [UN] PID: 8861 TASK: ffff8c6915478000 CPU: 3 > COMMAND: "reopen_file" > [0 00:00:00.059] [UN] PID: 8858 TASK: ffff8c6914271740 CPU: 2 > COMMAND: "reopen_file" > [0 00:00:00.109] [UN] PID: 8862 TASK: ffff8c691547dd00 CPU: 6 > COMMAND: "reopen_file" > #0 [ffff9965007c3c78] __schedule at ffffffff9b6e6095 > #1 [ffff9965007c3d08] schedule at ffffffff9b6e64df > #2 [ffff9965007c3d18] schedule_timeout at ffffffff9b6e9f89 > #3 [ffff9965007c3d90] msleep at ffffffff9af573a9 > #4 [ffff9965007c3d98] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs] > #5 [ffff9965007c3e88] cifs_close at ffffffffc0a07aaf [cifs] > #6 [ffff9965007c3ea0] __fput at ffffffff9b0efa6e > #7 [ffff9965007c3ee8] task_work_run at ffffffff9aef1614 > #8 [ffff9965007c3f20] exit_to_usermode_loop at ffffffff9ae03d6f > #9 [ffff9965007c3f38] do_syscall_64 at ffffffff9ae0444c > > closing the file, and trying to down_write cifsi->lock_sem > > > [0 00:48:22.839] [UN] PID: 8857 TASK: ffff8c6914270000 CPU: 7 > COMMAND: "reopen_file" > #0 [ffff9965006a7cc8] __schedule at ffffffff9b6e6095 > #1 [ffff9965006a7d58] schedule at ffffffff9b6e64df > #2 [ffff9965006a7d68] io_schedule at ffffffff9b6e68e2 > #3 [ffff9965006a7d78] wait_on_page_bit at ffffffff9b03cac6 > #4 [ffff9965006a7e10] __filemap_fdatawait_range at ffffffff9b03b028 > #5 [ffff9965006a7ed8] filemap_write_and_wait at ffffffff9b040165 > #6 [ffff9965006a7ef0] cifs_flush at ffffffffc0a0c2fa [cifs] > #7 [ffff9965006a7f10] filp_close at ffffffff9b0e93f1 > #8 [ffff9965006a7f30] __x64_sys_close at ffffffff9b0e9a0e > #9 [ffff9965006a7f38] do_syscall_64 at ffffffff9ae04315 > > in __filemap_fdatawait_range > wait_on_page_writeback(page); > for the same page of the file > > > > [0 00:48:22.718] [UN] PID: 8855 TASK: ffff8c69142745c0 CPU: 7 > COMMAND: "reopen_file" > #0 [ffff9965005dfc98] __schedule at ffffffff9b6e6095 > #1 [ffff9965005dfd28] schedule at ffffffff9b6e64df > #2 [ffff9965005dfd38] rwsem_down_write_slowpath at ffffffff9af283d7 > #3 [ffff9965005dfdf0] cifs_strict_writev at ffffffffc0a0c40a [cifs] > #4 [ffff9965005dfe48] new_sync_write at ffffffff9b0ec1dd > #5 [ffff9965005dfed0] vfs_write at ffffffff9b0eed35 > #6 [ffff9965005dff00] ksys_write at ffffffff9b0eefd9 > #7 [ffff9965005dff38] do_syscall_64 at ffffffff9ae04315 > > inode_lock(inode); > > > and one 'ls' later on, to see whether the rest of the mount is available > (the test file is in the root, so we get blocked up on the directory > ->i_rwsem), so the entire mount is unavailable > > [0 00:36:26.473] [UN] PID: 9802 TASK: ffff8c691436ae80 CPU: 4 > COMMAND: "ls" > #0 [ffff996500393d28] __schedule at ffffffff9b6e6095 > #1 [ffff996500393db8] schedule at ffffffff9b6e64df > #2 [ffff996500393dc8] rwsem_down_read_slowpath at ffffffff9b6e9421 > #3 [ffff996500393e78] down_read_killable at ffffffff9b6e95e2 > #4 [ffff996500393e88] iterate_dir at ffffffff9b103c56 > #5 [ffff996500393ec8] ksys_getdents64 at ffffffff9b104b0c > #6 [ffff996500393f30] __x64_sys_getdents64 at ffffffff9b104bb6 > #7 [ffff996500393f38] do_syscall_64 at ffffffff9ae04315 > > in iterate_dir: > if (shared) > res = down_read_killable(&inode->i_rwsem); <<<< > else > res = down_write_killable(&inode->i_rwsem); > Reported-by: Frank Sorenson <sorenson@redhat.com> Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com> Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2019-11-03 03:06:37 +00:00
struct workqueue_struct *fileinfo_put_wq;
CIFS: fix oplock break deadlocks When the final cifsFileInfo_put() is called from cifsiod and an oplock break work is queued, lockdep complains loudly: ============================================= [ INFO: possible recursive locking detected ] 4.11.0+ #21 Not tainted --------------------------------------------- kworker/0:2/78 is trying to acquire lock: ("cifsiod"){++++.+}, at: flush_work+0x215/0x350 but task is already holding lock: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock("cifsiod"); lock("cifsiod"); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by kworker/0:2/78: #0: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 #1: ((&wdata->work)){+.+...}, at: process_one_work+0x255/0x8e0 stack backtrace: CPU: 0 PID: 78 Comm: kworker/0:2 Not tainted 4.11.0+ #21 Workqueue: cifsiod cifs_writev_complete Call Trace: dump_stack+0x85/0xc2 __lock_acquire+0x17dd/0x2260 ? match_held_lock+0x20/0x2b0 ? trace_hardirqs_off_caller+0x86/0x130 ? mark_lock+0xa6/0x920 lock_acquire+0xcc/0x260 ? lock_acquire+0xcc/0x260 ? flush_work+0x215/0x350 flush_work+0x236/0x350 ? flush_work+0x215/0x350 ? destroy_worker+0x170/0x170 __cancel_work_timer+0x17d/0x210 ? ___preempt_schedule+0x16/0x18 cancel_work_sync+0x10/0x20 cifsFileInfo_put+0x338/0x7f0 cifs_writedata_release+0x2a/0x40 ? cifs_writedata_release+0x2a/0x40 cifs_writev_complete+0x29d/0x850 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 This is a real warning. Since the oplock is queued on the same workqueue this can deadlock if there is only one worker thread active for the workqueue (which will be the case during memory pressure when the rescuer thread is handling it). Furthermore, there is at least one other kind of hang possible due to the oplock break handling if there is only worker. (This can be reproduced without introducing memory pressure by having passing 1 for the max_active parameter of cifsiod.) cifs_oplock_break() can wait indefintely in the filemap_fdatawait() while the cifs_writev_complete() work is blocked: sysrq: SysRq : Show Blocked State task PC stack pid father kworker/0:1 D 0 16 2 0x00000000 Workqueue: cifsiod cifs_oplock_break Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x4a/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 cifs_oplock_break+0x651/0x710 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 dd D 0 683 171 0x00000000 Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x29/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 filemap_write_and_wait+0x4e/0x70 cifs_flush+0x6a/0xb0 filp_close+0x52/0xa0 __close_fd+0xdc/0x150 SyS_close+0x33/0x60 entry_SYSCALL_64_fastpath+0x1f/0xbe Showing all locks held in the system: 2 locks held by kworker/0:1/16: #0: ("cifsiod"){.+.+.+}, at: process_one_work+0x255/0x8e0 #1: ((&cfile->oplock_break)){+.+.+.}, at: process_one_work+0x255/0x8e0 Showing busy workqueues and worker pools: workqueue cifsiod: flags=0xc pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/1 in-flight: 16:cifs_oplock_break delayed: cifs_writev_complete, cifs_echo_request pool 0: cpus=0 node=0 flags=0x0 nice=0 hung=0s workers=3 idle: 750 3 Fix these problems by creating a a new workqueue (with a rescuer) for the oplock break work. Signed-off-by: Rabin Vincent <rabinv@axis.com> Signed-off-by: Steve French <smfrench@gmail.com> CC: Stable <stable@vger.kernel.org>
2017-05-03 15:54:01 +00:00
struct workqueue_struct *cifsoplockd_wq;
struct workqueue_struct *deferredclose_wq;
__u32 cifs_lock_secret;
/*
* Bumps refcount for cifs super block.
* Note that it should be only called if a referece to VFS super block is
* already held, e.g. in open-type syscalls context. Otherwise it can race with
* atomic_dec_and_test in deactivate_locked_super.
*/
void
cifs_sb_active(struct super_block *sb)
{
struct cifs_sb_info *server = CIFS_SB(sb);
if (atomic_inc_return(&server->active) == 1)
atomic_inc(&sb->s_active);
}
void
cifs_sb_deactive(struct super_block *sb)
{
struct cifs_sb_info *server = CIFS_SB(sb);
if (atomic_dec_and_test(&server->active))
deactivate_super(sb);
}
static int
cifs_read_super(struct super_block *sb)
{
struct inode *inode;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
struct timespec64 ts;
int rc = 0;
cifs_sb = CIFS_SB(sb);
tcon = cifs_sb_master_tcon(cifs_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL)
Rename superblock flags (MS_xyz -> SB_xyz) This is a pure automated search-and-replace of the internal kernel superblock flags. The s_flags are now called SB_*, with the names and the values for the moment mirroring the MS_* flags that they're equivalent to. Note how the MS_xyz flags are the ones passed to the mount system call, while the SB_xyz flags are what we then use in sb->s_flags. The script to do this was: # places to look in; re security/*: it generally should *not* be # touched (that stuff parses mount(2) arguments directly), but # there are two places where we really deal with superblock flags. FILES="drivers/mtd drivers/staging/lustre fs ipc mm \ include/linux/fs.h include/uapi/linux/bfs_fs.h \ security/apparmor/apparmorfs.c security/apparmor/include/lib.h" # the list of MS_... constants SYMS="RDONLY NOSUID NODEV NOEXEC SYNCHRONOUS REMOUNT MANDLOCK \ DIRSYNC NOATIME NODIRATIME BIND MOVE REC VERBOSE SILENT \ POSIXACL UNBINDABLE PRIVATE SLAVE SHARED RELATIME KERNMOUNT \ I_VERSION STRICTATIME LAZYTIME SUBMOUNT NOREMOTELOCK NOSEC BORN \ ACTIVE NOUSER" SED_PROG= for i in $SYMS; do SED_PROG="$SED_PROG -e s/MS_$i/SB_$i/g"; done # we want files that contain at least one of MS_..., # with fs/namespace.c and fs/pnode.c excluded. L=$(for i in $SYMS; do git grep -w -l MS_$i $FILES; done| sort|uniq|grep -v '^fs/namespace.c'|grep -v '^fs/pnode.c') for f in $L; do sed -i $f $SED_PROG; done Requested-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-11-27 21:05:09 +00:00
sb->s_flags |= SB_POSIXACL;
if (tcon->snapshot_time)
sb->s_flags |= SB_RDONLY;
if (tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files)
sb->s_maxbytes = MAX_LFS_FILESIZE;
else
sb->s_maxbytes = MAX_NON_LFS;
/*
* Some very old servers like DOS and OS/2 used 2 second granularity
* (while all current servers use 100ns granularity - see MS-DTYP)
* but 1 second is the maximum allowed granularity for the VFS
* so for old servers set time granularity to 1 second while for
* everything else (current servers) set it to 100ns.
*/
if ((tcon->ses->server->vals->protocol_id == SMB10_PROT_ID) &&
((tcon->ses->capabilities &
tcon->ses->server->vals->cap_nt_find) == 0) &&
!tcon->unix_ext) {
sb->s_time_gran = 1000000000; /* 1 second is max allowed gran */
ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MIN), 0, 0);
sb->s_time_min = ts.tv_sec;
ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MAX),
cpu_to_le16(SMB_TIME_MAX), 0);
sb->s_time_max = ts.tv_sec;
} else {
/*
* Almost every server, including all SMB2+, uses DCE TIME
* ie 100 nanosecond units, since 1601. See MS-DTYP and MS-FSCC
*/
sb->s_time_gran = 100;
ts = cifs_NTtimeToUnix(0);
sb->s_time_min = ts.tv_sec;
ts = cifs_NTtimeToUnix(cpu_to_le64(S64_MAX));
sb->s_time_max = ts.tv_sec;
}
sb->s_magic = CIFS_SUPER_MAGIC;
sb->s_op = &cifs_super_ops;
sb->s_xattr = cifs_xattr_handlers;
rc = super_setup_bdi(sb);
if (rc)
goto out_no_root;
/* tune readahead according to rsize if readahead size not set on mount */
if (cifs_sb->ctx->rsize == 0)
cifs_sb->ctx->rsize =
tcon->ses->server->ops->negotiate_rsize(tcon, cifs_sb->ctx);
if (cifs_sb->ctx->rasize)
sb->s_bdi->ra_pages = cifs_sb->ctx->rasize / PAGE_SIZE;
else
sb->s_bdi->ra_pages = 2 * (cifs_sb->ctx->rsize / PAGE_SIZE);
sb->s_blocksize = CIFS_MAX_MSGSIZE;
sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
inode = cifs_root_iget(sb);
if (IS_ERR(inode)) {
rc = PTR_ERR(inode);
goto out_no_root;
}
if (tcon->nocase)
sb->s_d_op = &cifs_ci_dentry_ops;
else
sb->s_d_op = &cifs_dentry_ops;
sb->s_root = d_make_root(inode);
if (!sb->s_root) {
rc = -ENOMEM;
goto out_no_root;
}
#ifdef CONFIG_CIFS_NFSD_EXPORT
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
cifs_dbg(FYI, "export ops supported\n");
sb->s_export_op = &cifs_export_ops;
}
#endif /* CONFIG_CIFS_NFSD_EXPORT */
return 0;
out_no_root:
cifs_dbg(VFS, "%s: get root inode failed\n", __func__);
return rc;
}
static void cifs_kill_sb(struct super_block *sb)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
/*
* We ned to release all dentries for the cached directories
* before we kill the sb.
*/
if (cifs_sb->root) {
close_all_cached_dirs(cifs_sb);
/* finally release root dentry */
dput(cifs_sb->root);
cifs_sb->root = NULL;
}
kill_anon_super(sb);
cifs_umount(cifs_sb);
}
static int
cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct TCP_Server_Info *server = tcon->ses->server;
unsigned int xid;
int rc = 0;
xid = get_xid();
if (le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength) > 0)
buf->f_namelen =
le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength);
else
buf->f_namelen = PATH_MAX;
buf->f_fsid.val[0] = tcon->vol_serial_number;
/* are using part of create time for more randomness, see man statfs */
buf->f_fsid.val[1] = (int)le64_to_cpu(tcon->vol_create_time);
buf->f_files = 0; /* undefined */
buf->f_ffree = 0; /* unlimited */
if (server->ops->queryfs)
rc = server->ops->queryfs(xid, tcon, cifs_sb, buf);
free_xid(xid);
return rc;
}
static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len)
{
struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct TCP_Server_Info *server = tcon->ses->server;
if (server->ops->fallocate)
return server->ops->fallocate(file, tcon, mode, off, len);
return -EOPNOTSUPP;
}
static int cifs_permission(struct mnt_idmap *idmap,
struct inode *inode, int mask)
{
struct cifs_sb_info *cifs_sb;
cifs_sb = CIFS_SB(inode->i_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
if ((mask & MAY_EXEC) && !execute_ok(inode))
return -EACCES;
else
return 0;
} else /* file mode might have been restricted at mount time
on the client (above and beyond ACL on servers) for
servers which do not support setting and viewing mode bits,
so allowing client to check permissions is useful */
return generic_permission(&nop_mnt_idmap, inode, mask);
}
static struct kmem_cache *cifs_inode_cachep;
static struct kmem_cache *cifs_req_cachep;
static struct kmem_cache *cifs_mid_cachep;
static struct kmem_cache *cifs_sm_req_cachep;
mempool_t *cifs_sm_req_poolp;
mempool_t *cifs_req_poolp;
mempool_t *cifs_mid_poolp;
static struct inode *
cifs_alloc_inode(struct super_block *sb)
{
struct cifsInodeInfo *cifs_inode;
cifs_inode = alloc_inode_sb(sb, cifs_inode_cachep, GFP_KERNEL);
if (!cifs_inode)
return NULL;
cifs_inode->cifsAttrs = 0x20; /* default */
cifs_inode->time = 0;
/*
* Until the file is open and we have gotten oplock info back from the
* server, can not assume caching of file data or metadata.
*/
cifs_set_oplock_level(cifs_inode, 0);
cifs_inode->flags = 0;
cifs: Wait for writebacks to complete before attempting write. Problem reported in Red Hat bz 1040329 for strict writes where we cache only when we hold oplock and write direct to the server when we don't. When we receive an oplock break, we first change the oplock value for the inode in cifsInodeInfo->oplock to indicate that we no longer hold the oplock before we enqueue a task to flush changes to the backing device. Once we have completed flushing the changes, we return the oplock to the server. There are 2 ways here where we can have data corruption 1) While we flush changes to the backing device as part of the oplock break, we can have processes write to the file. These writes check for the oplock, find none and attempt to write directly to the server. These direct writes made while we are flushing from cache could be overwritten by data being flushed from the cache causing data corruption. 2) While a thread runs in cifs_strict_writev, the machine could receive and process an oplock break after the thread has checked the oplock and found that it allows us to cache and before we have made changes to the cache. In that case, we end up with a dirty page in cache when we shouldn't have any. This will be flushed later and will overwrite all subsequent writes to the part of the file represented by this page. Before making any writes to the server, we need to confirm that we are not in the process of flushing data to the server and if we are, we should wait until the process is complete before we attempt the write. We should also wait for existing writes to complete before we process an oplock break request which changes oplock values. We add a version specific downgrade_oplock() operation to allow for differences in the oplock values set for the different smb versions. Cc: stable@vger.kernel.org Signed-off-by: Sachin Prabhu <sprabhu@redhat.com> Reviewed-by: Jeff Layton <jlayton@redhat.com> Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru> Signed-off-by: Steve French <smfrench@gmail.com>
2014-03-11 16:11:47 +00:00
spin_lock_init(&cifs_inode->writers_lock);
cifs_inode->writers = 0;
netfs: Fix gcc-12 warning by embedding vfs inode in netfs_i_context While randstruct was satisfied with using an open-coded "void *" offset cast for the netfs_i_context <-> inode casting, __builtin_object_size() as used by FORTIFY_SOURCE was not as easily fooled. This was causing the following complaint[1] from gcc v12: In file included from include/linux/string.h:253, from include/linux/ceph/ceph_debug.h:7, from fs/ceph/inode.c:2: In function 'fortify_memset_chk', inlined from 'netfs_i_context_init' at include/linux/netfs.h:326:2, inlined from 'ceph_alloc_inode' at fs/ceph/inode.c:463:2: include/linux/fortify-string.h:242:25: warning: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Wattribute-warning] 242 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by embedding a struct inode into struct netfs_i_context (which should perhaps be renamed to struct netfs_inode). The struct inode vfs_inode fields are then removed from the 9p, afs, ceph and cifs inode structs and vfs_inode is then simply changed to "netfs.inode" in those filesystems. Further, rename netfs_i_context to netfs_inode, get rid of the netfs_inode() function that converted a netfs_i_context pointer to an inode pointer (that can now be done with &ctx->inode) and rename the netfs_i_context() function to netfs_inode() (which is now a wrapper around container_of()). Most of the changes were done with: perl -p -i -e 's/vfs_inode/netfs.inode/'g \ `git grep -l 'vfs_inode' -- fs/{9p,afs,ceph,cifs}/*.[ch]` Kees suggested doing it with a pair structure[2] and a special declarator to insert that into the network filesystem's inode wrapper[3], but I think it's cleaner to embed it - and then it doesn't matter if struct randomisation reorders things. Dave Chinner suggested using a filesystem-specific VFS_I() function in each filesystem to convert that filesystem's own inode wrapper struct into the VFS inode struct[4]. Version #2: - Fix a couple of missed name changes due to a disabled cifs option. - Rename nfs_i_context to nfs_inode - Use "netfs" instead of "nic" as the member name in per-fs inode wrapper structs. [ This also undoes commit 507160f46c55 ("netfs: gcc-12: temporarily disable '-Wattribute-warning' for now") that is no longer needed ] Fixes: bc899ee1c898 ("netfs: Add a netfs inode context") Reported-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Xiubo Li <xiubli@redhat.com> cc: Jonathan Corbet <corbet@lwn.net> cc: Eric Van Hensbergen <ericvh@gmail.com> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: Ilya Dryomov <idryomov@gmail.com> cc: Steve French <smfrench@gmail.com> cc: William Kucharski <william.kucharski@oracle.com> cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> cc: Dave Chinner <david@fromorbit.com> cc: linux-doc@vger.kernel.org cc: v9fs-developer@lists.sourceforge.net cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: samba-technical@lists.samba.org cc: linux-fsdevel@vger.kernel.org cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/r/d2ad3a3d7bdd794c6efb562d2f2b655fb67756b9.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/20220517210230.864239-1-keescook@chromium.org/ [2] Link: https://lore.kernel.org/r/20220518202212.2322058-1-keescook@chromium.org/ [3] Link: https://lore.kernel.org/r/20220524101205.GI2306852@dread.disaster.area/ [4] Link: https://lore.kernel.org/r/165296786831.3591209.12111293034669289733.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165305805651.4094995.7763502506786714216.stgit@warthog.procyon.org.uk # v2 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-06-09 20:46:04 +00:00
cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
cifs_inode->netfs.remote_i_size = 0;
cifs_inode->uniqueid = 0;
cifs_inode->createtime = 0;
cifs_inode->epoch = 0;
spin_lock_init(&cifs_inode->open_file_lock);
generate_random_uuid(cifs_inode->lease_key);
cifs_inode->symlink_target = NULL;
/*
* Can not set i_flags here - they get immediately overwritten to zero
* by the VFS.
*/
netfs: Fix gcc-12 warning by embedding vfs inode in netfs_i_context While randstruct was satisfied with using an open-coded "void *" offset cast for the netfs_i_context <-> inode casting, __builtin_object_size() as used by FORTIFY_SOURCE was not as easily fooled. This was causing the following complaint[1] from gcc v12: In file included from include/linux/string.h:253, from include/linux/ceph/ceph_debug.h:7, from fs/ceph/inode.c:2: In function 'fortify_memset_chk', inlined from 'netfs_i_context_init' at include/linux/netfs.h:326:2, inlined from 'ceph_alloc_inode' at fs/ceph/inode.c:463:2: include/linux/fortify-string.h:242:25: warning: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Wattribute-warning] 242 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by embedding a struct inode into struct netfs_i_context (which should perhaps be renamed to struct netfs_inode). The struct inode vfs_inode fields are then removed from the 9p, afs, ceph and cifs inode structs and vfs_inode is then simply changed to "netfs.inode" in those filesystems. Further, rename netfs_i_context to netfs_inode, get rid of the netfs_inode() function that converted a netfs_i_context pointer to an inode pointer (that can now be done with &ctx->inode) and rename the netfs_i_context() function to netfs_inode() (which is now a wrapper around container_of()). Most of the changes were done with: perl -p -i -e 's/vfs_inode/netfs.inode/'g \ `git grep -l 'vfs_inode' -- fs/{9p,afs,ceph,cifs}/*.[ch]` Kees suggested doing it with a pair structure[2] and a special declarator to insert that into the network filesystem's inode wrapper[3], but I think it's cleaner to embed it - and then it doesn't matter if struct randomisation reorders things. Dave Chinner suggested using a filesystem-specific VFS_I() function in each filesystem to convert that filesystem's own inode wrapper struct into the VFS inode struct[4]. Version #2: - Fix a couple of missed name changes due to a disabled cifs option. - Rename nfs_i_context to nfs_inode - Use "netfs" instead of "nic" as the member name in per-fs inode wrapper structs. [ This also undoes commit 507160f46c55 ("netfs: gcc-12: temporarily disable '-Wattribute-warning' for now") that is no longer needed ] Fixes: bc899ee1c898 ("netfs: Add a netfs inode context") Reported-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Xiubo Li <xiubli@redhat.com> cc: Jonathan Corbet <corbet@lwn.net> cc: Eric Van Hensbergen <ericvh@gmail.com> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: Ilya Dryomov <idryomov@gmail.com> cc: Steve French <smfrench@gmail.com> cc: William Kucharski <william.kucharski@oracle.com> cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> cc: Dave Chinner <david@fromorbit.com> cc: linux-doc@vger.kernel.org cc: v9fs-developer@lists.sourceforge.net cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: samba-technical@lists.samba.org cc: linux-fsdevel@vger.kernel.org cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/r/d2ad3a3d7bdd794c6efb562d2f2b655fb67756b9.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/20220517210230.864239-1-keescook@chromium.org/ [2] Link: https://lore.kernel.org/r/20220518202212.2322058-1-keescook@chromium.org/ [3] Link: https://lore.kernel.org/r/20220524101205.GI2306852@dread.disaster.area/ [4] Link: https://lore.kernel.org/r/165296786831.3591209.12111293034669289733.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165305805651.4094995.7763502506786714216.stgit@warthog.procyon.org.uk # v2 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-06-09 20:46:04 +00:00
/* cifs_inode->netfs.inode.i_flags = S_NOATIME | S_NOCMTIME; */
INIT_LIST_HEAD(&cifs_inode->openFileList);
INIT_LIST_HEAD(&cifs_inode->llist);
INIT_LIST_HEAD(&cifs_inode->deferred_closes);
spin_lock_init(&cifs_inode->deferred_lock);
netfs: Fix gcc-12 warning by embedding vfs inode in netfs_i_context While randstruct was satisfied with using an open-coded "void *" offset cast for the netfs_i_context <-> inode casting, __builtin_object_size() as used by FORTIFY_SOURCE was not as easily fooled. This was causing the following complaint[1] from gcc v12: In file included from include/linux/string.h:253, from include/linux/ceph/ceph_debug.h:7, from fs/ceph/inode.c:2: In function 'fortify_memset_chk', inlined from 'netfs_i_context_init' at include/linux/netfs.h:326:2, inlined from 'ceph_alloc_inode' at fs/ceph/inode.c:463:2: include/linux/fortify-string.h:242:25: warning: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Wattribute-warning] 242 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by embedding a struct inode into struct netfs_i_context (which should perhaps be renamed to struct netfs_inode). The struct inode vfs_inode fields are then removed from the 9p, afs, ceph and cifs inode structs and vfs_inode is then simply changed to "netfs.inode" in those filesystems. Further, rename netfs_i_context to netfs_inode, get rid of the netfs_inode() function that converted a netfs_i_context pointer to an inode pointer (that can now be done with &ctx->inode) and rename the netfs_i_context() function to netfs_inode() (which is now a wrapper around container_of()). Most of the changes were done with: perl -p -i -e 's/vfs_inode/netfs.inode/'g \ `git grep -l 'vfs_inode' -- fs/{9p,afs,ceph,cifs}/*.[ch]` Kees suggested doing it with a pair structure[2] and a special declarator to insert that into the network filesystem's inode wrapper[3], but I think it's cleaner to embed it - and then it doesn't matter if struct randomisation reorders things. Dave Chinner suggested using a filesystem-specific VFS_I() function in each filesystem to convert that filesystem's own inode wrapper struct into the VFS inode struct[4]. Version #2: - Fix a couple of missed name changes due to a disabled cifs option. - Rename nfs_i_context to nfs_inode - Use "netfs" instead of "nic" as the member name in per-fs inode wrapper structs. [ This also undoes commit 507160f46c55 ("netfs: gcc-12: temporarily disable '-Wattribute-warning' for now") that is no longer needed ] Fixes: bc899ee1c898 ("netfs: Add a netfs inode context") Reported-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Xiubo Li <xiubli@redhat.com> cc: Jonathan Corbet <corbet@lwn.net> cc: Eric Van Hensbergen <ericvh@gmail.com> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: Ilya Dryomov <idryomov@gmail.com> cc: Steve French <smfrench@gmail.com> cc: William Kucharski <william.kucharski@oracle.com> cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> cc: Dave Chinner <david@fromorbit.com> cc: linux-doc@vger.kernel.org cc: v9fs-developer@lists.sourceforge.net cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: samba-technical@lists.samba.org cc: linux-fsdevel@vger.kernel.org cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/r/d2ad3a3d7bdd794c6efb562d2f2b655fb67756b9.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/20220517210230.864239-1-keescook@chromium.org/ [2] Link: https://lore.kernel.org/r/20220518202212.2322058-1-keescook@chromium.org/ [3] Link: https://lore.kernel.org/r/20220524101205.GI2306852@dread.disaster.area/ [4] Link: https://lore.kernel.org/r/165296786831.3591209.12111293034669289733.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165305805651.4094995.7763502506786714216.stgit@warthog.procyon.org.uk # v2 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-06-09 20:46:04 +00:00
return &cifs_inode->netfs.inode;
}
static void
cifs_free_inode(struct inode *inode)
{
struct cifsInodeInfo *cinode = CIFS_I(inode);
if (S_ISLNK(inode->i_mode))
kfree(cinode->symlink_target);
kmem_cache_free(cifs_inode_cachep, cinode);
}
static void
cifs_evict_inode(struct inode *inode)
{
mm + fs: store shadow entries in page cache Reclaim will be leaving shadow entries in the page cache radix tree upon evicting the real page. As those pages are found from the LRU, an iput() can lead to the inode being freed concurrently. At this point, reclaim must no longer install shadow pages because the inode freeing code needs to ensure the page tree is really empty. Add an address_space flag, AS_EXITING, that the inode freeing code sets under the tree lock before doing the final truncate. Reclaim will check for this flag before installing shadow pages. Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Rik van Riel <riel@redhat.com> Reviewed-by: Minchan Kim <minchan@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Bob Liu <bob.liu@oracle.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Greg Thelen <gthelen@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jan Kara <jack@suse.cz> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Luigi Semenzato <semenzato@google.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Metin Doslu <metin@citusdata.com> Cc: Michel Lespinasse <walken@google.com> Cc: Ozgun Erdogan <ozgun@citusdata.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Roman Gushchin <klamm@yandex-team.ru> Cc: Ryan Mallon <rmallon@gmail.com> Cc: Tejun Heo <tj@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-03 21:47:49 +00:00
truncate_inode_pages_final(&inode->i_data);
if (inode->i_state & I_PINNING_NETFS_WB)
cifs: Support fscache indexing rewrite Change the cifs filesystem to take account of the changes to fscache's indexing rewrite and reenable caching in cifs. The following changes have been made: (1) The fscache_netfs struct is no more, and there's no need to register the filesystem as a whole. (2) The session cookie is now an fscache_volume cookie, allocated with fscache_acquire_volume(). That takes three parameters: a string representing the "volume" in the index, a string naming the cache to use (or NULL) and a u64 that conveys coherency metadata for the volume. For cifs, I've made it render the volume name string as: "cifs,<ipaddress>,<sharename>" where the sharename has '/' characters replaced with ';'. This probably needs rethinking a bit as the total name could exceed the maximum filename component length. Further, the coherency data is currently just set to 0. It needs something else doing with it - I wonder if it would suffice simply to sum the resource_id, vol_create_time and vol_serial_number or maybe hash them. (3) The fscache_cookie_def is no more and needed information is passed directly to fscache_acquire_cookie(). The cache no longer calls back into the filesystem, but rather metadata changes are indicated at other times. fscache_acquire_cookie() is passed the same keying and coherency information as before. (4) The functions to set/reset cookies are removed and fscache_use_cookie() and fscache_unuse_cookie() are used instead. fscache_use_cookie() is passed a flag to indicate if the cookie is opened for writing. fscache_unuse_cookie() is passed updates for the metadata if we changed it (ie. if the file was opened for writing). These are called when the file is opened or closed. (5) cifs_setattr_*() are made to call fscache_resize() to change the size of the cache object. (6) The functions to read and write data are stubbed out pending a conversion to use netfslib. Changes ======= ver #8: - Abstract cache invalidation into a helper function. - Fix some checkpatch warnings[3]. ver #7: - Removed the accidentally added-back call to get the super cookie in cifs_root_iget(). - Fixed the right call to cifs_fscache_get_super_cookie() to take account of the "-o fsc" mount flag. ver #6: - Moved the change of gfpflags_allow_blocking() to current_is_kswapd() for cifs here. - Fixed one of the error paths in cifs_atomic_open() to jump around the call to use the cookie. - Fixed an additional successful return in the middle of cifs_open() to use the cookie on the way out. - Only get a volume cookie (and thus inode cookies) when "-o fsc" is supplied to mount. ver #5: - Fixed a couple of bits of cookie handling[2]: - The cookie should be released in cifs_evict_inode(), not cifsFileInfo_put_final(). The cookie needs to persist beyond file closure so that writepages will be able to write to it. - fscache_use_cookie() needs to be called in cifs_atomic_open() as it is for cifs_open(). ver #4: - Fixed the use of sizeof with memset. - tcon->vol_create_time is __le64 so doesn't need cpu_to_le64(). ver #3: - Canonicalise the cifs coherency data to make the cache portable. - Set volume coherency data. ver #2: - Use gfpflags_allow_blocking() rather than using flag directly. - Upgraded to -rc4 to allow for upstream changes[1]. - fscache_acquire_volume() now returns errors. Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: Jeff Layton <jlayton@kernel.org> cc: Steve French <smfrench@gmail.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=23b55d673d7527b093cd97b7c217c82e70cd1af0 [1] Link: https://lore.kernel.org/r/3419813.1641592362@warthog.procyon.org.uk/ [2] Link: https://lore.kernel.org/r/CAH2r5muTanw9pJqzAHd01d9A8keeChkzGsCEH6=0rHutVLAF-A@mail.gmail.com/ [3] Link: https://lore.kernel.org/r/163819671009.215744.11230627184193298714.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906982979.143852.10672081929614953210.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967187187.1823006.247415138444991444.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021579335.640689.2681324337038770579.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/3462849.1641593783@warthog.procyon.org.uk/ # v5 Link: https://lore.kernel.org/r/1318953.1642024578@warthog.procyon.org.uk/ # v6 Signed-off-by: Steve French <stfrench@microsoft.com>
2020-11-17 15:56:59 +00:00
cifs_fscache_unuse_inode_cookie(inode, true);
cifs_fscache_release_inode_cookie(inode);
clear_inode(inode);
}
static void
cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
{
struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
seq_puts(s, ",addr=");
switch (server->dstaddr.ss_family) {
case AF_INET:
seq_printf(s, "%pI4", &sa->sin_addr.s_addr);
break;
case AF_INET6:
seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr);
if (sa6->sin6_scope_id)
seq_printf(s, "%%%u", sa6->sin6_scope_id);
break;
default:
seq_puts(s, "(unknown)");
}
if (server->rdma)
seq_puts(s, ",rdma");
}
static void
cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
{
if (ses->sectype == Unspecified) {
if (ses->user_name == NULL)
seq_puts(s, ",sec=none");
return;
}
seq_puts(s, ",sec=");
switch (ses->sectype) {
case NTLMv2:
seq_puts(s, "ntlmv2");
break;
case Kerberos:
seq_puts(s, "krb5");
break;
case RawNTLMSSP:
seq_puts(s, "ntlmssp");
break;
default:
/* shouldn't ever happen */
seq_puts(s, "unknown");
break;
}
if (ses->sign)
seq_puts(s, "i");
if (ses->sectype == Kerberos)
seq_printf(s, ",cruid=%u",
from_kuid_munged(&init_user_ns, ses->cred_uid));
}
static void
cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb)
{
seq_puts(s, ",cache=");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
seq_puts(s, "strict");
else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
seq_puts(s, "none");
else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE)
seq_puts(s, "singleclient"); /* assume only one client access */
else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE)
seq_puts(s, "ro"); /* read only caching assumed */
else
seq_puts(s, "loose");
}
/*
* cifs_show_devname() is used so we show the mount device name with correct
* format (e.g. forward slashes vs. back slashes) in /proc/mounts
*/
static int cifs_show_devname(struct seq_file *m, struct dentry *root)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
char *devname = kstrdup(cifs_sb->ctx->source, GFP_KERNEL);
if (devname == NULL)
seq_puts(m, "none");
else {
convert_delimiter(devname, '/');
/* escape all spaces in share names */
seq_escape(m, devname, " \t");
kfree(devname);
}
return 0;
}
/*
* cifs_show_options() is for displaying mount options in /proc/mounts.
* Not all settable options are displayed but most of the important
* ones are.
*/
static int
cifs_show_options(struct seq_file *s, struct dentry *root)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct sockaddr *srcaddr;
srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
fs: create and use seq_show_option for escaping Many file systems that implement the show_options hook fail to correctly escape their output which could lead to unescaped characters (e.g. new lines) leaking into /proc/mounts and /proc/[pid]/mountinfo files. This could lead to confusion, spoofed entries (resulting in things like systemd issuing false d-bus "mount" notifications), and who knows what else. This looks like it would only be the root user stepping on themselves, but it's possible weird things could happen in containers or in other situations with delegated mount privileges. Here's an example using overlay with setuid fusermount trusting the contents of /proc/mounts (via the /etc/mtab symlink). Imagine the use of "sudo" is something more sneaky: $ BASE="ovl" $ MNT="$BASE/mnt" $ LOW="$BASE/lower" $ UP="$BASE/upper" $ WORK="$BASE/work/ 0 0 none /proc fuse.pwn user_id=1000" $ mkdir -p "$LOW" "$UP" "$WORK" $ sudo mount -t overlay -o "lowerdir=$LOW,upperdir=$UP,workdir=$WORK" none /mnt $ cat /proc/mounts none /root/ovl/mnt overlay rw,relatime,lowerdir=ovl/lower,upperdir=ovl/upper,workdir=ovl/work/ 0 0 none /proc fuse.pwn user_id=1000 0 0 $ fusermount -u /proc $ cat /proc/mounts cat: /proc/mounts: No such file or directory This fixes the problem by adding new seq_show_option and seq_show_option_n helpers, and updating the vulnerable show_option handlers to use them as needed. Some, like SELinux, need to be open coded due to unusual existing escape mechanisms. [akpm@linux-foundation.org: add lost chunk, per Kees] [keescook@chromium.org: seq_show_option should be using const parameters] Signed-off-by: Kees Cook <keescook@chromium.org> Acked-by: Serge Hallyn <serge.hallyn@canonical.com> Acked-by: Jan Kara <jack@suse.com> Acked-by: Paul Moore <paul@paul-moore.com> Cc: J. R. Okajima <hooanon05g@gmail.com> Signed-off-by: Kees Cook <keescook@chromium.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-04 22:44:57 +00:00
seq_show_option(s, "vers", tcon->ses->server->vals->version_string);
cifs_show_security(s, tcon->ses);
cifs_show_cache_flavor(s, cifs_sb);
if (tcon->no_lease)
seq_puts(s, ",nolease");
if (cifs_sb->ctx->multiuser)
seq_puts(s, ",multiuser");
else if (tcon->ses->user_name)
fs: create and use seq_show_option for escaping Many file systems that implement the show_options hook fail to correctly escape their output which could lead to unescaped characters (e.g. new lines) leaking into /proc/mounts and /proc/[pid]/mountinfo files. This could lead to confusion, spoofed entries (resulting in things like systemd issuing false d-bus "mount" notifications), and who knows what else. This looks like it would only be the root user stepping on themselves, but it's possible weird things could happen in containers or in other situations with delegated mount privileges. Here's an example using overlay with setuid fusermount trusting the contents of /proc/mounts (via the /etc/mtab symlink). Imagine the use of "sudo" is something more sneaky: $ BASE="ovl" $ MNT="$BASE/mnt" $ LOW="$BASE/lower" $ UP="$BASE/upper" $ WORK="$BASE/work/ 0 0 none /proc fuse.pwn user_id=1000" $ mkdir -p "$LOW" "$UP" "$WORK" $ sudo mount -t overlay -o "lowerdir=$LOW,upperdir=$UP,workdir=$WORK" none /mnt $ cat /proc/mounts none /root/ovl/mnt overlay rw,relatime,lowerdir=ovl/lower,upperdir=ovl/upper,workdir=ovl/work/ 0 0 none /proc fuse.pwn user_id=1000 0 0 $ fusermount -u /proc $ cat /proc/mounts cat: /proc/mounts: No such file or directory This fixes the problem by adding new seq_show_option and seq_show_option_n helpers, and updating the vulnerable show_option handlers to use them as needed. Some, like SELinux, need to be open coded due to unusual existing escape mechanisms. [akpm@linux-foundation.org: add lost chunk, per Kees] [keescook@chromium.org: seq_show_option should be using const parameters] Signed-off-by: Kees Cook <keescook@chromium.org> Acked-by: Serge Hallyn <serge.hallyn@canonical.com> Acked-by: Jan Kara <jack@suse.com> Acked-by: Paul Moore <paul@paul-moore.com> Cc: J. R. Okajima <hooanon05g@gmail.com> Signed-off-by: Kees Cook <keescook@chromium.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-04 22:44:57 +00:00
seq_show_option(s, "username", tcon->ses->user_name);
if (tcon->ses->domainName && tcon->ses->domainName[0] != 0)
fs: create and use seq_show_option for escaping Many file systems that implement the show_options hook fail to correctly escape their output which could lead to unescaped characters (e.g. new lines) leaking into /proc/mounts and /proc/[pid]/mountinfo files. This could lead to confusion, spoofed entries (resulting in things like systemd issuing false d-bus "mount" notifications), and who knows what else. This looks like it would only be the root user stepping on themselves, but it's possible weird things could happen in containers or in other situations with delegated mount privileges. Here's an example using overlay with setuid fusermount trusting the contents of /proc/mounts (via the /etc/mtab symlink). Imagine the use of "sudo" is something more sneaky: $ BASE="ovl" $ MNT="$BASE/mnt" $ LOW="$BASE/lower" $ UP="$BASE/upper" $ WORK="$BASE/work/ 0 0 none /proc fuse.pwn user_id=1000" $ mkdir -p "$LOW" "$UP" "$WORK" $ sudo mount -t overlay -o "lowerdir=$LOW,upperdir=$UP,workdir=$WORK" none /mnt $ cat /proc/mounts none /root/ovl/mnt overlay rw,relatime,lowerdir=ovl/lower,upperdir=ovl/upper,workdir=ovl/work/ 0 0 none /proc fuse.pwn user_id=1000 0 0 $ fusermount -u /proc $ cat /proc/mounts cat: /proc/mounts: No such file or directory This fixes the problem by adding new seq_show_option and seq_show_option_n helpers, and updating the vulnerable show_option handlers to use them as needed. Some, like SELinux, need to be open coded due to unusual existing escape mechanisms. [akpm@linux-foundation.org: add lost chunk, per Kees] [keescook@chromium.org: seq_show_option should be using const parameters] Signed-off-by: Kees Cook <keescook@chromium.org> Acked-by: Serge Hallyn <serge.hallyn@canonical.com> Acked-by: Jan Kara <jack@suse.com> Acked-by: Paul Moore <paul@paul-moore.com> Cc: J. R. Okajima <hooanon05g@gmail.com> Signed-off-by: Kees Cook <keescook@chromium.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-04 22:44:57 +00:00
seq_show_option(s, "domain", tcon->ses->domainName);
if (srcaddr->sa_family != AF_UNSPEC) {
struct sockaddr_in *saddr4;
struct sockaddr_in6 *saddr6;
saddr4 = (struct sockaddr_in *)srcaddr;
saddr6 = (struct sockaddr_in6 *)srcaddr;
if (srcaddr->sa_family == AF_INET6)
seq_printf(s, ",srcaddr=%pI6c",
&saddr6->sin6_addr);
else if (srcaddr->sa_family == AF_INET)
seq_printf(s, ",srcaddr=%pI4",
&saddr4->sin_addr.s_addr);
else
seq_printf(s, ",srcaddr=BAD-AF:%i",
(int)(srcaddr->sa_family));
}
seq_printf(s, ",uid=%u",
from_kuid_munged(&init_user_ns, cifs_sb->ctx->linux_uid));
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
seq_puts(s, ",forceuid");
else
seq_puts(s, ",noforceuid");
seq_printf(s, ",gid=%u",
from_kgid_munged(&init_user_ns, cifs_sb->ctx->linux_gid));
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
seq_puts(s, ",forcegid");
else
seq_puts(s, ",noforcegid");
cifs_show_address(s, tcon->ses->server);
if (!tcon->unix_ext)
seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho",
cifs_sb->ctx->file_mode,
cifs_sb->ctx->dir_mode);
if (cifs_sb->ctx->iocharset)
seq_printf(s, ",iocharset=%s", cifs_sb->ctx->iocharset);
if (tcon->seal)
seq_puts(s, ",seal");
else if (tcon->ses->server->ignore_signature)
seq_puts(s, ",signloosely");
if (tcon->nocase)
seq_puts(s, ",nocase");
if (tcon->nodelete)
seq_puts(s, ",nodelete");
if (cifs_sb->ctx->no_sparse)
seq_puts(s, ",nosparse");
if (tcon->local_lease)
seq_puts(s, ",locallease");
if (tcon->retry)
seq_puts(s, ",hard");
else
seq_puts(s, ",soft");
if (tcon->use_persistent)
seq_puts(s, ",persistenthandles");
else if (tcon->use_resilient)
seq_puts(s, ",resilienthandles");
if (tcon->posix_extensions)
seq_puts(s, ",posix");
else if (tcon->unix_ext)
seq_puts(s, ",unix");
else
seq_puts(s, ",nounix");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
seq_puts(s, ",nodfs");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
seq_puts(s, ",posixpaths");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
seq_puts(s, ",setuids");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL)
seq_puts(s, ",idsfromsid");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
seq_puts(s, ",serverino");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
seq_puts(s, ",rwpidforward");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL)
seq_puts(s, ",forcemand");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
seq_puts(s, ",nouser_xattr");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
seq_puts(s, ",mapchars");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR)
seq_puts(s, ",mapposix");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
seq_puts(s, ",sfu");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
seq_puts(s, ",nobrl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_HANDLE_CACHE)
seq_puts(s, ",nohandlecache");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)
seq_puts(s, ",modefromsid");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
seq_puts(s, ",cifsacl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
seq_puts(s, ",dynperm");
Rename superblock flags (MS_xyz -> SB_xyz) This is a pure automated search-and-replace of the internal kernel superblock flags. The s_flags are now called SB_*, with the names and the values for the moment mirroring the MS_* flags that they're equivalent to. Note how the MS_xyz flags are the ones passed to the mount system call, while the SB_xyz flags are what we then use in sb->s_flags. The script to do this was: # places to look in; re security/*: it generally should *not* be # touched (that stuff parses mount(2) arguments directly), but # there are two places where we really deal with superblock flags. FILES="drivers/mtd drivers/staging/lustre fs ipc mm \ include/linux/fs.h include/uapi/linux/bfs_fs.h \ security/apparmor/apparmorfs.c security/apparmor/include/lib.h" # the list of MS_... constants SYMS="RDONLY NOSUID NODEV NOEXEC SYNCHRONOUS REMOUNT MANDLOCK \ DIRSYNC NOATIME NODIRATIME BIND MOVE REC VERBOSE SILENT \ POSIXACL UNBINDABLE PRIVATE SLAVE SHARED RELATIME KERNMOUNT \ I_VERSION STRICTATIME LAZYTIME SUBMOUNT NOREMOTELOCK NOSEC BORN \ ACTIVE NOUSER" SED_PROG= for i in $SYMS; do SED_PROG="$SED_PROG -e s/MS_$i/SB_$i/g"; done # we want files that contain at least one of MS_..., # with fs/namespace.c and fs/pnode.c excluded. L=$(for i in $SYMS; do git grep -w -l MS_$i $FILES; done| sort|uniq|grep -v '^fs/namespace.c'|grep -v '^fs/pnode.c') for f in $L; do sed -i $f $SED_PROG; done Requested-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-11-27 21:05:09 +00:00
if (root->d_sb->s_flags & SB_POSIXACL)
seq_puts(s, ",acl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
seq_puts(s, ",mfsymlinks");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
seq_puts(s, ",fsc");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)
seq_puts(s, ",nostrictsync");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
seq_puts(s, ",noperm");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID)
seq_printf(s, ",backupuid=%u",
from_kuid_munged(&init_user_ns,
cifs_sb->ctx->backupuid));
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID)
seq_printf(s, ",backupgid=%u",
from_kgid_munged(&init_user_ns,
cifs_sb->ctx->backupgid));
seq_printf(s, ",rsize=%u", cifs_sb->ctx->rsize);
seq_printf(s, ",wsize=%u", cifs_sb->ctx->wsize);
seq_printf(s, ",bsize=%u", cifs_sb->ctx->bsize);
if (cifs_sb->ctx->rasize)
seq_printf(s, ",rasize=%u", cifs_sb->ctx->rasize);
if (tcon->ses->server->min_offload)
seq_printf(s, ",esize=%u", tcon->ses->server->min_offload);
if (tcon->ses->server->retrans)
seq_printf(s, ",retrans=%u", tcon->ses->server->retrans);
seq_printf(s, ",echo_interval=%lu",
tcon->ses->server->echo_interval / HZ);
/* Only display the following if overridden on mount */
if (tcon->ses->server->max_credits != SMB2_MAX_CREDITS_AVAILABLE)
seq_printf(s, ",max_credits=%u", tcon->ses->server->max_credits);
if (tcon->ses->server->tcp_nodelay)
seq_puts(s, ",tcpnodelay");
if (tcon->ses->server->noautotune)
seq_puts(s, ",noautotune");
if (tcon->ses->server->noblocksnd)
seq_puts(s, ",noblocksend");
if (tcon->ses->server->nosharesock)
seq_puts(s, ",nosharesock");
if (tcon->snapshot_time)
seq_printf(s, ",snapshot=%llu", tcon->snapshot_time);
if (tcon->handle_timeout)
seq_printf(s, ",handletimeout=%u", tcon->handle_timeout);
if (tcon->max_cached_dirs != MAX_CACHED_FIDS)
seq_printf(s, ",max_cached_dirs=%u", tcon->max_cached_dirs);
/*
* Display file and directory attribute timeout in seconds.
* If file and directory attribute timeout the same then actimeo
* was likely specified on mount
*/
if (cifs_sb->ctx->acdirmax == cifs_sb->ctx->acregmax)
seq_printf(s, ",actimeo=%lu", cifs_sb->ctx->acregmax / HZ);
else {
seq_printf(s, ",acdirmax=%lu", cifs_sb->ctx->acdirmax / HZ);
seq_printf(s, ",acregmax=%lu", cifs_sb->ctx->acregmax / HZ);
}
seq_printf(s, ",closetimeo=%lu", cifs_sb->ctx->closetimeo / HZ);
if (tcon->ses->chan_max > 1)
seq_printf(s, ",multichannel,max_channels=%zu",
tcon->ses->chan_max);
if (tcon->use_witness)
seq_puts(s, ",witness");
return 0;
}
static void cifs_umount_begin(struct super_block *sb)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon;
if (cifs_sb == NULL)
return;
tcon = cifs_sb_master_tcon(cifs_sb);
spin_lock(&cifs_tcp_ses_lock);
spin_lock(&tcon->tc_lock);
if ((tcon->tc_count > 1) || (tcon->status == TID_EXITING)) {
/* we have other mounts to same share or we have
already tried to umount this and woken up
all waiting network requests, nothing to do */
spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
return;
}
/*
* can not set tcon->status to TID_EXITING yet since we don't know if umount -f will
* fail later (e.g. due to open files). TID_EXITING will be set just before tdis req sent
*/
spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
cifs_close_all_deferred_files(tcon);
/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
2005-11-09 23:21:09 +00:00
/* cancel_notify_requests(tcon); */
if (tcon->ses && tcon->ses->server) {
cifs_dbg(FYI, "wake up tasks now - umount begin not complete\n");
wake_up_all(&tcon->ses->server->request_q);
wake_up_all(&tcon->ses->server->response_q);
msleep(1); /* yield */
/* we have to kick the requests once more */
wake_up_all(&tcon->ses->server->response_q);
msleep(1);
}
return;
}
static int cifs_freeze(struct super_block *sb)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon;
if (cifs_sb == NULL)
return 0;
tcon = cifs_sb_master_tcon(cifs_sb);
cifs_close_all_deferred_files(tcon);
return 0;
}
#ifdef CONFIG_CIFS_STATS2
static int cifs_show_stats(struct seq_file *s, struct dentry *root)
{
/* BB FIXME */
return 0;
}
#endif
cifs: Support fscache indexing rewrite Change the cifs filesystem to take account of the changes to fscache's indexing rewrite and reenable caching in cifs. The following changes have been made: (1) The fscache_netfs struct is no more, and there's no need to register the filesystem as a whole. (2) The session cookie is now an fscache_volume cookie, allocated with fscache_acquire_volume(). That takes three parameters: a string representing the "volume" in the index, a string naming the cache to use (or NULL) and a u64 that conveys coherency metadata for the volume. For cifs, I've made it render the volume name string as: "cifs,<ipaddress>,<sharename>" where the sharename has '/' characters replaced with ';'. This probably needs rethinking a bit as the total name could exceed the maximum filename component length. Further, the coherency data is currently just set to 0. It needs something else doing with it - I wonder if it would suffice simply to sum the resource_id, vol_create_time and vol_serial_number or maybe hash them. (3) The fscache_cookie_def is no more and needed information is passed directly to fscache_acquire_cookie(). The cache no longer calls back into the filesystem, but rather metadata changes are indicated at other times. fscache_acquire_cookie() is passed the same keying and coherency information as before. (4) The functions to set/reset cookies are removed and fscache_use_cookie() and fscache_unuse_cookie() are used instead. fscache_use_cookie() is passed a flag to indicate if the cookie is opened for writing. fscache_unuse_cookie() is passed updates for the metadata if we changed it (ie. if the file was opened for writing). These are called when the file is opened or closed. (5) cifs_setattr_*() are made to call fscache_resize() to change the size of the cache object. (6) The functions to read and write data are stubbed out pending a conversion to use netfslib. Changes ======= ver #8: - Abstract cache invalidation into a helper function. - Fix some checkpatch warnings[3]. ver #7: - Removed the accidentally added-back call to get the super cookie in cifs_root_iget(). - Fixed the right call to cifs_fscache_get_super_cookie() to take account of the "-o fsc" mount flag. ver #6: - Moved the change of gfpflags_allow_blocking() to current_is_kswapd() for cifs here. - Fixed one of the error paths in cifs_atomic_open() to jump around the call to use the cookie. - Fixed an additional successful return in the middle of cifs_open() to use the cookie on the way out. - Only get a volume cookie (and thus inode cookies) when "-o fsc" is supplied to mount. ver #5: - Fixed a couple of bits of cookie handling[2]: - The cookie should be released in cifs_evict_inode(), not cifsFileInfo_put_final(). The cookie needs to persist beyond file closure so that writepages will be able to write to it. - fscache_use_cookie() needs to be called in cifs_atomic_open() as it is for cifs_open(). ver #4: - Fixed the use of sizeof with memset. - tcon->vol_create_time is __le64 so doesn't need cpu_to_le64(). ver #3: - Canonicalise the cifs coherency data to make the cache portable. - Set volume coherency data. ver #2: - Use gfpflags_allow_blocking() rather than using flag directly. - Upgraded to -rc4 to allow for upstream changes[1]. - fscache_acquire_volume() now returns errors. Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: Jeff Layton <jlayton@kernel.org> cc: Steve French <smfrench@gmail.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=23b55d673d7527b093cd97b7c217c82e70cd1af0 [1] Link: https://lore.kernel.org/r/3419813.1641592362@warthog.procyon.org.uk/ [2] Link: https://lore.kernel.org/r/CAH2r5muTanw9pJqzAHd01d9A8keeChkzGsCEH6=0rHutVLAF-A@mail.gmail.com/ [3] Link: https://lore.kernel.org/r/163819671009.215744.11230627184193298714.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906982979.143852.10672081929614953210.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967187187.1823006.247415138444991444.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021579335.640689.2681324337038770579.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/3462849.1641593783@warthog.procyon.org.uk/ # v5 Link: https://lore.kernel.org/r/1318953.1642024578@warthog.procyon.org.uk/ # v6 Signed-off-by: Steve French <stfrench@microsoft.com>
2020-11-17 15:56:59 +00:00
static int cifs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
return netfs_unpin_writeback(inode, wbc);
cifs: Support fscache indexing rewrite Change the cifs filesystem to take account of the changes to fscache's indexing rewrite and reenable caching in cifs. The following changes have been made: (1) The fscache_netfs struct is no more, and there's no need to register the filesystem as a whole. (2) The session cookie is now an fscache_volume cookie, allocated with fscache_acquire_volume(). That takes three parameters: a string representing the "volume" in the index, a string naming the cache to use (or NULL) and a u64 that conveys coherency metadata for the volume. For cifs, I've made it render the volume name string as: "cifs,<ipaddress>,<sharename>" where the sharename has '/' characters replaced with ';'. This probably needs rethinking a bit as the total name could exceed the maximum filename component length. Further, the coherency data is currently just set to 0. It needs something else doing with it - I wonder if it would suffice simply to sum the resource_id, vol_create_time and vol_serial_number or maybe hash them. (3) The fscache_cookie_def is no more and needed information is passed directly to fscache_acquire_cookie(). The cache no longer calls back into the filesystem, but rather metadata changes are indicated at other times. fscache_acquire_cookie() is passed the same keying and coherency information as before. (4) The functions to set/reset cookies are removed and fscache_use_cookie() and fscache_unuse_cookie() are used instead. fscache_use_cookie() is passed a flag to indicate if the cookie is opened for writing. fscache_unuse_cookie() is passed updates for the metadata if we changed it (ie. if the file was opened for writing). These are called when the file is opened or closed. (5) cifs_setattr_*() are made to call fscache_resize() to change the size of the cache object. (6) The functions to read and write data are stubbed out pending a conversion to use netfslib. Changes ======= ver #8: - Abstract cache invalidation into a helper function. - Fix some checkpatch warnings[3]. ver #7: - Removed the accidentally added-back call to get the super cookie in cifs_root_iget(). - Fixed the right call to cifs_fscache_get_super_cookie() to take account of the "-o fsc" mount flag. ver #6: - Moved the change of gfpflags_allow_blocking() to current_is_kswapd() for cifs here. - Fixed one of the error paths in cifs_atomic_open() to jump around the call to use the cookie. - Fixed an additional successful return in the middle of cifs_open() to use the cookie on the way out. - Only get a volume cookie (and thus inode cookies) when "-o fsc" is supplied to mount. ver #5: - Fixed a couple of bits of cookie handling[2]: - The cookie should be released in cifs_evict_inode(), not cifsFileInfo_put_final(). The cookie needs to persist beyond file closure so that writepages will be able to write to it. - fscache_use_cookie() needs to be called in cifs_atomic_open() as it is for cifs_open(). ver #4: - Fixed the use of sizeof with memset. - tcon->vol_create_time is __le64 so doesn't need cpu_to_le64(). ver #3: - Canonicalise the cifs coherency data to make the cache portable. - Set volume coherency data. ver #2: - Use gfpflags_allow_blocking() rather than using flag directly. - Upgraded to -rc4 to allow for upstream changes[1]. - fscache_acquire_volume() now returns errors. Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: Jeff Layton <jlayton@kernel.org> cc: Steve French <smfrench@gmail.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=23b55d673d7527b093cd97b7c217c82e70cd1af0 [1] Link: https://lore.kernel.org/r/3419813.1641592362@warthog.procyon.org.uk/ [2] Link: https://lore.kernel.org/r/CAH2r5muTanw9pJqzAHd01d9A8keeChkzGsCEH6=0rHutVLAF-A@mail.gmail.com/ [3] Link: https://lore.kernel.org/r/163819671009.215744.11230627184193298714.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906982979.143852.10672081929614953210.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967187187.1823006.247415138444991444.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021579335.640689.2681324337038770579.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/3462849.1641593783@warthog.procyon.org.uk/ # v5 Link: https://lore.kernel.org/r/1318953.1642024578@warthog.procyon.org.uk/ # v6 Signed-off-by: Steve French <stfrench@microsoft.com>
2020-11-17 15:56:59 +00:00
}
static int cifs_drop_inode(struct inode *inode)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
/* no serverino => unconditional eviction */
return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) ||
generic_drop_inode(inode);
}
static const struct super_operations cifs_super_ops = {
.statfs = cifs_statfs,
.alloc_inode = cifs_alloc_inode,
cifs: Support fscache indexing rewrite Change the cifs filesystem to take account of the changes to fscache's indexing rewrite and reenable caching in cifs. The following changes have been made: (1) The fscache_netfs struct is no more, and there's no need to register the filesystem as a whole. (2) The session cookie is now an fscache_volume cookie, allocated with fscache_acquire_volume(). That takes three parameters: a string representing the "volume" in the index, a string naming the cache to use (or NULL) and a u64 that conveys coherency metadata for the volume. For cifs, I've made it render the volume name string as: "cifs,<ipaddress>,<sharename>" where the sharename has '/' characters replaced with ';'. This probably needs rethinking a bit as the total name could exceed the maximum filename component length. Further, the coherency data is currently just set to 0. It needs something else doing with it - I wonder if it would suffice simply to sum the resource_id, vol_create_time and vol_serial_number or maybe hash them. (3) The fscache_cookie_def is no more and needed information is passed directly to fscache_acquire_cookie(). The cache no longer calls back into the filesystem, but rather metadata changes are indicated at other times. fscache_acquire_cookie() is passed the same keying and coherency information as before. (4) The functions to set/reset cookies are removed and fscache_use_cookie() and fscache_unuse_cookie() are used instead. fscache_use_cookie() is passed a flag to indicate if the cookie is opened for writing. fscache_unuse_cookie() is passed updates for the metadata if we changed it (ie. if the file was opened for writing). These are called when the file is opened or closed. (5) cifs_setattr_*() are made to call fscache_resize() to change the size of the cache object. (6) The functions to read and write data are stubbed out pending a conversion to use netfslib. Changes ======= ver #8: - Abstract cache invalidation into a helper function. - Fix some checkpatch warnings[3]. ver #7: - Removed the accidentally added-back call to get the super cookie in cifs_root_iget(). - Fixed the right call to cifs_fscache_get_super_cookie() to take account of the "-o fsc" mount flag. ver #6: - Moved the change of gfpflags_allow_blocking() to current_is_kswapd() for cifs here. - Fixed one of the error paths in cifs_atomic_open() to jump around the call to use the cookie. - Fixed an additional successful return in the middle of cifs_open() to use the cookie on the way out. - Only get a volume cookie (and thus inode cookies) when "-o fsc" is supplied to mount. ver #5: - Fixed a couple of bits of cookie handling[2]: - The cookie should be released in cifs_evict_inode(), not cifsFileInfo_put_final(). The cookie needs to persist beyond file closure so that writepages will be able to write to it. - fscache_use_cookie() needs to be called in cifs_atomic_open() as it is for cifs_open(). ver #4: - Fixed the use of sizeof with memset. - tcon->vol_create_time is __le64 so doesn't need cpu_to_le64(). ver #3: - Canonicalise the cifs coherency data to make the cache portable. - Set volume coherency data. ver #2: - Use gfpflags_allow_blocking() rather than using flag directly. - Upgraded to -rc4 to allow for upstream changes[1]. - fscache_acquire_volume() now returns errors. Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: Jeff Layton <jlayton@kernel.org> cc: Steve French <smfrench@gmail.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=23b55d673d7527b093cd97b7c217c82e70cd1af0 [1] Link: https://lore.kernel.org/r/3419813.1641592362@warthog.procyon.org.uk/ [2] Link: https://lore.kernel.org/r/CAH2r5muTanw9pJqzAHd01d9A8keeChkzGsCEH6=0rHutVLAF-A@mail.gmail.com/ [3] Link: https://lore.kernel.org/r/163819671009.215744.11230627184193298714.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906982979.143852.10672081929614953210.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967187187.1823006.247415138444991444.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021579335.640689.2681324337038770579.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/3462849.1641593783@warthog.procyon.org.uk/ # v5 Link: https://lore.kernel.org/r/1318953.1642024578@warthog.procyon.org.uk/ # v6 Signed-off-by: Steve French <stfrench@microsoft.com>
2020-11-17 15:56:59 +00:00
.write_inode = cifs_write_inode,
.free_inode = cifs_free_inode,
.drop_inode = cifs_drop_inode,
.evict_inode = cifs_evict_inode,
/* .show_path = cifs_show_path, */ /* Would we ever need show path? */
.show_devname = cifs_show_devname,
/* .delete_inode = cifs_delete_inode, */ /* Do not need above
function unless later we add lazy close of inodes or unless the
kernel forgets to call us with the same number of releases (closes)
as opens */
.show_options = cifs_show_options,
2005-11-09 23:21:09 +00:00
.umount_begin = cifs_umount_begin,
.freeze_fs = cifs_freeze,
#ifdef CONFIG_CIFS_STATS2
.show_stats = cifs_show_stats,
#endif
};
/*
* Get root dentry from superblock according to prefix path mount option.
* Return dentry with refcount + 1 on success and NULL otherwise.
*/
static struct dentry *
cifs_get_root(struct smb3_fs_context *ctx, struct super_block *sb)
{
struct dentry *dentry;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
char *full_path = NULL;
char *s, *p;
char sep;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
return dget(sb->s_root);
full_path = cifs_build_path_to_root(ctx, cifs_sb,
cifs_sb_master_tcon(cifs_sb), 0);
if (full_path == NULL)
return ERR_PTR(-ENOMEM);
cifs_dbg(FYI, "Get root dentry for %s\n", full_path);
sep = CIFS_DIR_SEP(cifs_sb);
dentry = dget(sb->s_root);
s = full_path;
do {
struct inode *dir = d_inode(dentry);
struct dentry *child;
cifs: ensure that cifs_get_root() only traverses directories Kjell Braden reported this oops: [ 833.211970] BUG: unable to handle kernel NULL pointer dereference at (null) [ 833.212816] IP: [< (null)>] (null) [ 833.213280] PGD 1b9b2067 PUD e9f7067 PMD 0 [ 833.213874] Oops: 0010 [#1] SMP [ 833.214344] CPU 0 [ 833.214458] Modules linked in: des_generic md4 nls_utf8 cifs vboxvideo drm snd_intel8x0 snd_ac97_codec ac97_bus snd_pcm snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq bnep rfcomm snd_timer bluetooth snd_seq_device ppdev snd vboxguest parport_pc joydev mac_hid soundcore snd_page_alloc psmouse i2c_piix4 serio_raw lp parport usbhid hid e1000 [ 833.215629] [ 833.215629] Pid: 1752, comm: mount.cifs Not tainted 3.0.0-rc7-bisectcifs-fec11dd9a0+ #18 innotek GmbH VirtualBox/VirtualBox [ 833.215629] RIP: 0010:[<0000000000000000>] [< (null)>] (null) [ 833.215629] RSP: 0018:ffff8800119c9c50 EFLAGS: 00010282 [ 833.215629] RAX: ffffffffa02186c0 RBX: ffff88000c427780 RCX: 0000000000000000 [ 833.215629] RDX: 0000000000000000 RSI: ffff88000c427780 RDI: ffff88000c4362e8 [ 833.215629] RBP: ffff8800119c9c88 R08: ffff88001fc15e30 R09: 00000000d69515c7 [ 833.215629] R10: ffffffffa0201972 R11: ffff88000e8f6a28 R12: ffff88000c4362e8 [ 833.215629] R13: 0000000000000000 R14: 0000000000000000 R15: ffff88001181aaa6 [ 833.215629] FS: 00007f2986171700(0000) GS:ffff88001fc00000(0000) knlGS:0000000000000000 [ 833.215629] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 833.215629] CR2: 0000000000000000 CR3: 000000001b982000 CR4: 00000000000006f0 [ 833.215629] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 833.215629] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 833.215629] Process mount.cifs (pid: 1752, threadinfo ffff8800119c8000, task ffff88001c1c16f0) [ 833.215629] Stack: [ 833.215629] ffffffff8116a9b5 ffff8800119c9c88 ffffffff81178075 0000000000000286 [ 833.215629] 0000000000000000 ffff88000c4276c0 ffff8800119c9ce8 ffff8800119c9cc8 [ 833.215629] ffffffff8116b06e ffff88001bc6fc00 ffff88000c4276c0 ffff88000c4276c0 [ 833.215629] Call Trace: [ 833.215629] [<ffffffff8116a9b5>] ? d_alloc_and_lookup+0x45/0x90 [ 833.215629] [<ffffffff81178075>] ? d_lookup+0x35/0x60 [ 833.215629] [<ffffffff8116b06e>] __lookup_hash.part.14+0x9e/0xc0 [ 833.215629] [<ffffffff8116b1d6>] lookup_one_len+0x146/0x1e0 [ 833.215629] [<ffffffff815e4f7e>] ? _raw_spin_lock+0xe/0x20 [ 833.215629] [<ffffffffa01eef0d>] cifs_do_mount+0x26d/0x500 [cifs] [ 833.215629] [<ffffffff81163bd3>] mount_fs+0x43/0x1b0 [ 833.215629] [<ffffffff8117d41a>] vfs_kern_mount+0x6a/0xd0 [ 833.215629] [<ffffffff8117e584>] do_kern_mount+0x54/0x110 [ 833.215629] [<ffffffff8117fdc2>] do_mount+0x262/0x840 [ 833.215629] [<ffffffff81108a0e>] ? __get_free_pages+0xe/0x50 [ 833.215629] [<ffffffff8117f9ca>] ? copy_mount_options+0x3a/0x180 [ 833.215629] [<ffffffff8118075d>] sys_mount+0x8d/0xe0 [ 833.215629] [<ffffffff815ece82>] system_call_fastpath+0x16/0x1b [ 833.215629] Code: Bad RIP value. [ 833.215629] RIP [< (null)>] (null) [ 833.215629] RSP <ffff8800119c9c50> [ 833.215629] CR2: 0000000000000000 [ 833.238525] ---[ end trace ec00758b8d44f529 ]--- When walking down the path on the server, it's possible to hit a symlink. The path walking code assumes that the caller will handle that situation properly, but cifs_get_root() isn't set up for it. This patch prevents the oops by simply returning an error. A better solution would be to try and chase the symlinks here, but that's fairly complicated to handle. Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=53221 Reported-and-tested-by: Kjell Braden <afflux@pentabarf.de> Cc: stable <stable@vger.kernel.org> Signed-off-by: Jeff Layton <jlayton@redhat.com> Signed-off-by: Steve French <sfrench@us.ibm.com>
2013-02-01 20:11:01 +00:00
if (!S_ISDIR(dir->i_mode)) {
dput(dentry);
dentry = ERR_PTR(-ENOTDIR);
break;
}
/* skip separators */
while (*s == sep)
s++;
if (!*s)
break;
p = s++;
/* next separator */
while (*s && *s != sep)
s++;
child = lookup_positive_unlocked(p, dentry, s - p);
dput(dentry);
dentry = child;
} while (!IS_ERR(dentry));
kfree(full_path);
return dentry;
}
static int cifs_set_super(struct super_block *sb, void *data)
{
struct cifs_mnt_data *mnt_data = data;
sb->s_fs_info = mnt_data->cifs_sb;
return set_anon_super(sb, NULL);
}
struct dentry *
cifs_smb3_do_mount(struct file_system_type *fs_type,
int flags, struct smb3_fs_context *old_ctx)
{
struct cifs_mnt_data mnt_data;
struct cifs_sb_info *cifs_sb;
struct super_block *sb;
struct dentry *root;
int rc;
if (cifsFYI) {
cifs_dbg(FYI, "%s: devname=%s flags=0x%x\n", __func__,
old_ctx->source, flags);
} else {
cifs_info("Attempting to mount %s\n", old_ctx->source);
}
cifs_sb = kzalloc(sizeof(*cifs_sb), GFP_KERNEL);
if (!cifs_sb)
return ERR_PTR(-ENOMEM);
cifs_sb->ctx = kzalloc(sizeof(struct smb3_fs_context), GFP_KERNEL);
if (!cifs_sb->ctx) {
root = ERR_PTR(-ENOMEM);
goto out;
}
rc = smb3_fs_context_dup(cifs_sb->ctx, old_ctx);
if (rc) {
root = ERR_PTR(rc);
goto out;
}
rc = cifs_setup_cifs_sb(cifs_sb);
if (rc) {
root = ERR_PTR(rc);
goto out;
}
rc = cifs_mount(cifs_sb, cifs_sb->ctx);
if (rc) {
Rename superblock flags (MS_xyz -> SB_xyz) This is a pure automated search-and-replace of the internal kernel superblock flags. The s_flags are now called SB_*, with the names and the values for the moment mirroring the MS_* flags that they're equivalent to. Note how the MS_xyz flags are the ones passed to the mount system call, while the SB_xyz flags are what we then use in sb->s_flags. The script to do this was: # places to look in; re security/*: it generally should *not* be # touched (that stuff parses mount(2) arguments directly), but # there are two places where we really deal with superblock flags. FILES="drivers/mtd drivers/staging/lustre fs ipc mm \ include/linux/fs.h include/uapi/linux/bfs_fs.h \ security/apparmor/apparmorfs.c security/apparmor/include/lib.h" # the list of MS_... constants SYMS="RDONLY NOSUID NODEV NOEXEC SYNCHRONOUS REMOUNT MANDLOCK \ DIRSYNC NOATIME NODIRATIME BIND MOVE REC VERBOSE SILENT \ POSIXACL UNBINDABLE PRIVATE SLAVE SHARED RELATIME KERNMOUNT \ I_VERSION STRICTATIME LAZYTIME SUBMOUNT NOREMOTELOCK NOSEC BORN \ ACTIVE NOUSER" SED_PROG= for i in $SYMS; do SED_PROG="$SED_PROG -e s/MS_$i/SB_$i/g"; done # we want files that contain at least one of MS_..., # with fs/namespace.c and fs/pnode.c excluded. L=$(for i in $SYMS; do git grep -w -l MS_$i $FILES; done| sort|uniq|grep -v '^fs/namespace.c'|grep -v '^fs/pnode.c') for f in $L; do sed -i $f $SED_PROG; done Requested-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-11-27 21:05:09 +00:00
if (!(flags & SB_SILENT))
cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n",
rc);
root = ERR_PTR(rc);
goto out;
}
mnt_data.ctx = cifs_sb->ctx;
mnt_data.cifs_sb = cifs_sb;
mnt_data.flags = flags;
/* BB should we make this contingent on mount parm? */
Rename superblock flags (MS_xyz -> SB_xyz) This is a pure automated search-and-replace of the internal kernel superblock flags. The s_flags are now called SB_*, with the names and the values for the moment mirroring the MS_* flags that they're equivalent to. Note how the MS_xyz flags are the ones passed to the mount system call, while the SB_xyz flags are what we then use in sb->s_flags. The script to do this was: # places to look in; re security/*: it generally should *not* be # touched (that stuff parses mount(2) arguments directly), but # there are two places where we really deal with superblock flags. FILES="drivers/mtd drivers/staging/lustre fs ipc mm \ include/linux/fs.h include/uapi/linux/bfs_fs.h \ security/apparmor/apparmorfs.c security/apparmor/include/lib.h" # the list of MS_... constants SYMS="RDONLY NOSUID NODEV NOEXEC SYNCHRONOUS REMOUNT MANDLOCK \ DIRSYNC NOATIME NODIRATIME BIND MOVE REC VERBOSE SILENT \ POSIXACL UNBINDABLE PRIVATE SLAVE SHARED RELATIME KERNMOUNT \ I_VERSION STRICTATIME LAZYTIME SUBMOUNT NOREMOTELOCK NOSEC BORN \ ACTIVE NOUSER" SED_PROG= for i in $SYMS; do SED_PROG="$SED_PROG -e s/MS_$i/SB_$i/g"; done # we want files that contain at least one of MS_..., # with fs/namespace.c and fs/pnode.c excluded. L=$(for i in $SYMS; do git grep -w -l MS_$i $FILES; done| sort|uniq|grep -v '^fs/namespace.c'|grep -v '^fs/pnode.c') for f in $L; do sed -i $f $SED_PROG; done Requested-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-11-27 21:05:09 +00:00
flags |= SB_NODIRATIME | SB_NOATIME;
sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data);
if (IS_ERR(sb)) {
cifs_umount(cifs_sb);
return ERR_CAST(sb);
}
if (sb->s_root) {
cifs_dbg(FYI, "Use existing superblock\n");
cifs_umount(cifs_sb);
cifs_sb = NULL;
} else {
rc = cifs_read_super(sb);
if (rc) {
root = ERR_PTR(rc);
goto out_super;
}
Rename superblock flags (MS_xyz -> SB_xyz) This is a pure automated search-and-replace of the internal kernel superblock flags. The s_flags are now called SB_*, with the names and the values for the moment mirroring the MS_* flags that they're equivalent to. Note how the MS_xyz flags are the ones passed to the mount system call, while the SB_xyz flags are what we then use in sb->s_flags. The script to do this was: # places to look in; re security/*: it generally should *not* be # touched (that stuff parses mount(2) arguments directly), but # there are two places where we really deal with superblock flags. FILES="drivers/mtd drivers/staging/lustre fs ipc mm \ include/linux/fs.h include/uapi/linux/bfs_fs.h \ security/apparmor/apparmorfs.c security/apparmor/include/lib.h" # the list of MS_... constants SYMS="RDONLY NOSUID NODEV NOEXEC SYNCHRONOUS REMOUNT MANDLOCK \ DIRSYNC NOATIME NODIRATIME BIND MOVE REC VERBOSE SILENT \ POSIXACL UNBINDABLE PRIVATE SLAVE SHARED RELATIME KERNMOUNT \ I_VERSION STRICTATIME LAZYTIME SUBMOUNT NOREMOTELOCK NOSEC BORN \ ACTIVE NOUSER" SED_PROG= for i in $SYMS; do SED_PROG="$SED_PROG -e s/MS_$i/SB_$i/g"; done # we want files that contain at least one of MS_..., # with fs/namespace.c and fs/pnode.c excluded. L=$(for i in $SYMS; do git grep -w -l MS_$i $FILES; done| sort|uniq|grep -v '^fs/namespace.c'|grep -v '^fs/pnode.c') for f in $L; do sed -i $f $SED_PROG; done Requested-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-11-27 21:05:09 +00:00
sb->s_flags |= SB_ACTIVE;
}
root = cifs_get_root(cifs_sb ? cifs_sb->ctx : old_ctx, sb);
if (IS_ERR(root))
goto out_super;
if (cifs_sb)
cifs_sb->root = dget(root);
cifs_dbg(FYI, "dentry root is: %p\n", root);
return root;
out_super:
deactivate_locked_super(sb);
cifs: fix double free race when mount fails in cifs_get_root() When cifs_get_root() fails during cifs_smb3_do_mount() we call deactivate_locked_super() which eventually will call delayed_free() which will free the context. In this situation we should not proceed to enter the out: section in cifs_smb3_do_mount() and free the same resources a second time. [Thu Feb 10 12:59:06 2022] BUG: KASAN: use-after-free in rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] Read of size 8 at addr ffff888364f4d110 by task swapper/1/0 [Thu Feb 10 12:59:06 2022] CPU: 1 PID: 0 Comm: swapper/1 Tainted: G OE 5.17.0-rc3+ #4 [Thu Feb 10 12:59:06 2022] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.0 12/17/2019 [Thu Feb 10 12:59:06 2022] Call Trace: [Thu Feb 10 12:59:06 2022] <IRQ> [Thu Feb 10 12:59:06 2022] dump_stack_lvl+0x5d/0x78 [Thu Feb 10 12:59:06 2022] print_address_description.constprop.0+0x24/0x150 [Thu Feb 10 12:59:06 2022] ? rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] kasan_report.cold+0x7d/0x117 [Thu Feb 10 12:59:06 2022] ? rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] __asan_load8+0x86/0xa0 [Thu Feb 10 12:59:06 2022] rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] rcu_core+0x547/0xca0 [Thu Feb 10 12:59:06 2022] ? call_rcu+0x3c0/0x3c0 [Thu Feb 10 12:59:06 2022] ? __this_cpu_preempt_check+0x13/0x20 [Thu Feb 10 12:59:06 2022] ? lock_is_held_type+0xea/0x140 [Thu Feb 10 12:59:06 2022] rcu_core_si+0xe/0x10 [Thu Feb 10 12:59:06 2022] __do_softirq+0x1d4/0x67b [Thu Feb 10 12:59:06 2022] __irq_exit_rcu+0x100/0x150 [Thu Feb 10 12:59:06 2022] irq_exit_rcu+0xe/0x30 [Thu Feb 10 12:59:06 2022] sysvec_hyperv_stimer0+0x9d/0xc0 ... [Thu Feb 10 12:59:07 2022] Freed by task 58179: [Thu Feb 10 12:59:07 2022] kasan_save_stack+0x26/0x50 [Thu Feb 10 12:59:07 2022] kasan_set_track+0x25/0x30 [Thu Feb 10 12:59:07 2022] kasan_set_free_info+0x24/0x40 [Thu Feb 10 12:59:07 2022] ____kasan_slab_free+0x137/0x170 [Thu Feb 10 12:59:07 2022] __kasan_slab_free+0x12/0x20 [Thu Feb 10 12:59:07 2022] slab_free_freelist_hook+0xb3/0x1d0 [Thu Feb 10 12:59:07 2022] kfree+0xcd/0x520 [Thu Feb 10 12:59:07 2022] cifs_smb3_do_mount+0x149/0xbe0 [cifs] [Thu Feb 10 12:59:07 2022] smb3_get_tree+0x1a0/0x2e0 [cifs] [Thu Feb 10 12:59:07 2022] vfs_get_tree+0x52/0x140 [Thu Feb 10 12:59:07 2022] path_mount+0x635/0x10c0 [Thu Feb 10 12:59:07 2022] __x64_sys_mount+0x1bf/0x210 [Thu Feb 10 12:59:07 2022] do_syscall_64+0x5c/0xc0 [Thu Feb 10 12:59:07 2022] entry_SYSCALL_64_after_hwframe+0x44/0xae [Thu Feb 10 12:59:07 2022] Last potentially related work creation: [Thu Feb 10 12:59:07 2022] kasan_save_stack+0x26/0x50 [Thu Feb 10 12:59:07 2022] __kasan_record_aux_stack+0xb6/0xc0 [Thu Feb 10 12:59:07 2022] kasan_record_aux_stack_noalloc+0xb/0x10 [Thu Feb 10 12:59:07 2022] call_rcu+0x76/0x3c0 [Thu Feb 10 12:59:07 2022] cifs_umount+0xce/0xe0 [cifs] [Thu Feb 10 12:59:07 2022] cifs_kill_sb+0xc8/0xe0 [cifs] [Thu Feb 10 12:59:07 2022] deactivate_locked_super+0x5d/0xd0 [Thu Feb 10 12:59:07 2022] cifs_smb3_do_mount+0xab9/0xbe0 [cifs] [Thu Feb 10 12:59:07 2022] smb3_get_tree+0x1a0/0x2e0 [cifs] [Thu Feb 10 12:59:07 2022] vfs_get_tree+0x52/0x140 [Thu Feb 10 12:59:07 2022] path_mount+0x635/0x10c0 [Thu Feb 10 12:59:07 2022] __x64_sys_mount+0x1bf/0x210 [Thu Feb 10 12:59:07 2022] do_syscall_64+0x5c/0xc0 [Thu Feb 10 12:59:07 2022] entry_SYSCALL_64_after_hwframe+0x44/0xae Reported-by: Shyam Prasad N <sprasad@microsoft.com> Reviewed-by: Shyam Prasad N <sprasad@microsoft.com> Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2022-02-10 16:59:15 +00:00
return root;
out:
kfree(cifs_sb->prepath);
smb3_cleanup_fs_context(cifs_sb->ctx);
kfree(cifs_sb);
return root;
}
static ssize_t
cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t rc;
struct inode *inode = file_inode(iocb->ki_filp);
if (iocb->ki_flags & IOCB_DIRECT)
return cifs_user_readv(iocb, iter);
rc = cifs_revalidate_mapping(inode);
if (rc)
return rc;
return generic_file_read_iter(iocb, iter);
}
static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
cifs: Wait for writebacks to complete before attempting write. Problem reported in Red Hat bz 1040329 for strict writes where we cache only when we hold oplock and write direct to the server when we don't. When we receive an oplock break, we first change the oplock value for the inode in cifsInodeInfo->oplock to indicate that we no longer hold the oplock before we enqueue a task to flush changes to the backing device. Once we have completed flushing the changes, we return the oplock to the server. There are 2 ways here where we can have data corruption 1) While we flush changes to the backing device as part of the oplock break, we can have processes write to the file. These writes check for the oplock, find none and attempt to write directly to the server. These direct writes made while we are flushing from cache could be overwritten by data being flushed from the cache causing data corruption. 2) While a thread runs in cifs_strict_writev, the machine could receive and process an oplock break after the thread has checked the oplock and found that it allows us to cache and before we have made changes to the cache. In that case, we end up with a dirty page in cache when we shouldn't have any. This will be flushed later and will overwrite all subsequent writes to the part of the file represented by this page. Before making any writes to the server, we need to confirm that we are not in the process of flushing data to the server and if we are, we should wait until the process is complete before we attempt the write. We should also wait for existing writes to complete before we process an oplock break request which changes oplock values. We add a version specific downgrade_oplock() operation to allow for differences in the oplock values set for the different smb versions. Cc: stable@vger.kernel.org Signed-off-by: Sachin Prabhu <sprabhu@redhat.com> Reviewed-by: Jeff Layton <jlayton@redhat.com> Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru> Signed-off-by: Steve French <smfrench@gmail.com>
2014-03-11 16:11:47 +00:00
struct cifsInodeInfo *cinode = CIFS_I(inode);
ssize_t written;
int rc;
if (iocb->ki_filp->f_flags & O_DIRECT) {
written = cifs_user_writev(iocb, from);
if (written > 0 && CIFS_CACHE_READ(cinode)) {
cifs_zap_mapping(inode);
cifs_dbg(FYI,
"Set no oplock for inode=%p after a write operation\n",
inode);
cinode->oplock = 0;
}
return written;
}
cifs: Wait for writebacks to complete before attempting write. Problem reported in Red Hat bz 1040329 for strict writes where we cache only when we hold oplock and write direct to the server when we don't. When we receive an oplock break, we first change the oplock value for the inode in cifsInodeInfo->oplock to indicate that we no longer hold the oplock before we enqueue a task to flush changes to the backing device. Once we have completed flushing the changes, we return the oplock to the server. There are 2 ways here where we can have data corruption 1) While we flush changes to the backing device as part of the oplock break, we can have processes write to the file. These writes check for the oplock, find none and attempt to write directly to the server. These direct writes made while we are flushing from cache could be overwritten by data being flushed from the cache causing data corruption. 2) While a thread runs in cifs_strict_writev, the machine could receive and process an oplock break after the thread has checked the oplock and found that it allows us to cache and before we have made changes to the cache. In that case, we end up with a dirty page in cache when we shouldn't have any. This will be flushed later and will overwrite all subsequent writes to the part of the file represented by this page. Before making any writes to the server, we need to confirm that we are not in the process of flushing data to the server and if we are, we should wait until the process is complete before we attempt the write. We should also wait for existing writes to complete before we process an oplock break request which changes oplock values. We add a version specific downgrade_oplock() operation to allow for differences in the oplock values set for the different smb versions. Cc: stable@vger.kernel.org Signed-off-by: Sachin Prabhu <sprabhu@redhat.com> Reviewed-by: Jeff Layton <jlayton@redhat.com> Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru> Signed-off-by: Steve French <smfrench@gmail.com>
2014-03-11 16:11:47 +00:00
written = cifs_get_writer(cinode);
if (written)
return written;
written = generic_file_write_iter(iocb, from);
if (CIFS_CACHE_WRITE(CIFS_I(inode)))
cifs: Wait for writebacks to complete before attempting write. Problem reported in Red Hat bz 1040329 for strict writes where we cache only when we hold oplock and write direct to the server when we don't. When we receive an oplock break, we first change the oplock value for the inode in cifsInodeInfo->oplock to indicate that we no longer hold the oplock before we enqueue a task to flush changes to the backing device. Once we have completed flushing the changes, we return the oplock to the server. There are 2 ways here where we can have data corruption 1) While we flush changes to the backing device as part of the oplock break, we can have processes write to the file. These writes check for the oplock, find none and attempt to write directly to the server. These direct writes made while we are flushing from cache could be overwritten by data being flushed from the cache causing data corruption. 2) While a thread runs in cifs_strict_writev, the machine could receive and process an oplock break after the thread has checked the oplock and found that it allows us to cache and before we have made changes to the cache. In that case, we end up with a dirty page in cache when we shouldn't have any. This will be flushed later and will overwrite all subsequent writes to the part of the file represented by this page. Before making any writes to the server, we need to confirm that we are not in the process of flushing data to the server and if we are, we should wait until the process is complete before we attempt the write. We should also wait for existing writes to complete before we process an oplock break request which changes oplock values. We add a version specific downgrade_oplock() operation to allow for differences in the oplock values set for the different smb versions. Cc: stable@vger.kernel.org Signed-off-by: Sachin Prabhu <sprabhu@redhat.com> Reviewed-by: Jeff Layton <jlayton@redhat.com> Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru> Signed-off-by: Steve French <smfrench@gmail.com>
2014-03-11 16:11:47 +00:00
goto out;
rc = filemap_fdatawrite(inode->i_mapping);
if (rc)
cifs_dbg(FYI, "cifs_file_write_iter: %d rc on %p inode\n",
rc, inode);
cifs: Wait for writebacks to complete before attempting write. Problem reported in Red Hat bz 1040329 for strict writes where we cache only when we hold oplock and write direct to the server when we don't. When we receive an oplock break, we first change the oplock value for the inode in cifsInodeInfo->oplock to indicate that we no longer hold the oplock before we enqueue a task to flush changes to the backing device. Once we have completed flushing the changes, we return the oplock to the server. There are 2 ways here where we can have data corruption 1) While we flush changes to the backing device as part of the oplock break, we can have processes write to the file. These writes check for the oplock, find none and attempt to write directly to the server. These direct writes made while we are flushing from cache could be overwritten by data being flushed from the cache causing data corruption. 2) While a thread runs in cifs_strict_writev, the machine could receive and process an oplock break after the thread has checked the oplock and found that it allows us to cache and before we have made changes to the cache. In that case, we end up with a dirty page in cache when we shouldn't have any. This will be flushed later and will overwrite all subsequent writes to the part of the file represented by this page. Before making any writes to the server, we need to confirm that we are not in the process of flushing data to the server and if we are, we should wait until the process is complete before we attempt the write. We should also wait for existing writes to complete before we process an oplock break request which changes oplock values. We add a version specific downgrade_oplock() operation to allow for differences in the oplock values set for the different smb versions. Cc: stable@vger.kernel.org Signed-off-by: Sachin Prabhu <sprabhu@redhat.com> Reviewed-by: Jeff Layton <jlayton@redhat.com> Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru> Signed-off-by: Steve French <smfrench@gmail.com>
2014-03-11 16:11:47 +00:00
out:
cifs_put_writer(cinode);
return written;
}
static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
{
struct cifsFileInfo *cfile = file->private_data;
struct cifs_tcon *tcon;
/*
* whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
* the cached file length
*/
if (whence != SEEK_SET && whence != SEEK_CUR) {
int rc;
struct inode *inode = file_inode(file);
/*
* We need to be sure that all dirty pages are written and the
* server has the newest file length.
*/
if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping &&
inode->i_mapping->nrpages != 0) {
rc = filemap_fdatawait(inode->i_mapping);
if (rc) {
mapping_set_error(inode->i_mapping, rc);
return rc;
}
}
/*
* Some applications poll for the file length in this strange
* way so we must seek to end on non-oplocked files by
* setting the revalidate time to zero.
*/
CIFS_I(inode)->time = 0;
rc = cifs_revalidate_file_attr(file);
if (rc < 0)
return (loff_t)rc;
}
if (cfile && cfile->tlink) {
tcon = tlink_tcon(cfile->tlink);
if (tcon->ses->server->ops->llseek)
return tcon->ses->server->ops->llseek(file, tcon,
offset, whence);
}
return generic_file_llseek(file, offset, whence);
}
static int
cifs_setlease(struct file *file, int arg, struct file_lease **lease, void **priv)
{
/*
* Note that this is called by vfs setlease with i_lock held to
* protect *lease from going away.
*/
struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
/* Check if file is oplocked if this is request for new lease */
if (arg == F_UNLCK ||
((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) ||
((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode))))
return generic_setlease(file, arg, lease, priv);
else if (tlink_tcon(cfile->tlink)->local_lease &&
!CIFS_CACHE_READ(CIFS_I(inode)))
/*
* If the server claims to support oplock on this file, then we
* still need to check oplock even if the local_lease mount
* option is set, but there are servers which do not support
* oplock for which this mount option may be useful if the user
* knows that the file won't be changed on the server by anyone
* else.
*/
return generic_setlease(file, arg, lease, priv);
else
return -EAGAIN;
}
struct file_system_type cifs_fs_type = {
.owner = THIS_MODULE,
.name = "cifs",
.init_fs_context = smb3_init_fs_context,
.parameters = smb3_fs_parameters,
.kill_sb = cifs_kill_sb,
.fs_flags = FS_RENAME_DOES_D_MOVE,
};
MODULE_ALIAS_FS("cifs");
struct file_system_type smb3_fs_type = {
.owner = THIS_MODULE,
.name = "smb3",
.init_fs_context = smb3_init_fs_context,
.parameters = smb3_fs_parameters,
.kill_sb = cifs_kill_sb,
.fs_flags = FS_RENAME_DOES_D_MOVE,
};
MODULE_ALIAS_FS("smb3");
MODULE_ALIAS("smb3");
const struct inode_operations cifs_dir_inode_ops = {
.create = cifs_create,
.atomic_open = cifs_atomic_open,
.lookup = cifs_lookup,
.getattr = cifs_getattr,
.unlink = cifs_unlink,
.link = cifs_hardlink,
.mkdir = cifs_mkdir,
.rmdir = cifs_rmdir,
.rename = cifs_rename2,
.permission = cifs_permission,
.setattr = cifs_setattr,
.symlink = cifs_symlink,
.mknod = cifs_mknod,
.listxattr = cifs_listxattr,
cifs: implement get acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystem to implement get and set acl. So far cifs wasn't able to implement get and set acl inode operations because it needs access to the dentry. Now that we extended the set acl inode operation to take a dentry argument and added a new get acl inode operation that takes a dentry argument we can let cifs implement get and set acl inode operations. This is mostly a copy and paste of the codepaths currently used in cifs' posix acl xattr handler. After we have fully implemented the posix acl api and switched the vfs over to it, the cifs specific posix acl xattr handler and associated code will be removed and the code duplication will go away. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
2022-09-22 15:17:02 +00:00
.get_acl = cifs_get_acl,
cifs: implement set acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystem to implement get and set acl. So far cifs wasn't able to implement get and set acl inode operations because it needs access to the dentry. Now that we extended the set acl inode operation to take a dentry argument and added a new get acl inode operation that takes a dentry argument we can let cifs implement get and set acl inode operations. This is mostly a copy and paste of the codepaths currently used in cifs' posix acl xattr handler. After we have fully implemented the posix acl api and switched the vfs over to it, the cifs specific posix acl xattr handler and associated code will be removed and the code duplication will go away. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
2022-09-22 15:17:03 +00:00
.set_acl = cifs_set_acl,
};
const struct inode_operations cifs_file_inode_ops = {
.setattr = cifs_setattr,
.getattr = cifs_getattr,
.permission = cifs_permission,
.listxattr = cifs_listxattr,
.fiemap = cifs_fiemap,
cifs: implement get acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystem to implement get and set acl. So far cifs wasn't able to implement get and set acl inode operations because it needs access to the dentry. Now that we extended the set acl inode operation to take a dentry argument and added a new get acl inode operation that takes a dentry argument we can let cifs implement get and set acl inode operations. This is mostly a copy and paste of the codepaths currently used in cifs' posix acl xattr handler. After we have fully implemented the posix acl api and switched the vfs over to it, the cifs specific posix acl xattr handler and associated code will be removed and the code duplication will go away. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
2022-09-22 15:17:02 +00:00
.get_acl = cifs_get_acl,
cifs: implement set acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystem to implement get and set acl. So far cifs wasn't able to implement get and set acl inode operations because it needs access to the dentry. Now that we extended the set acl inode operation to take a dentry argument and added a new get acl inode operation that takes a dentry argument we can let cifs implement get and set acl inode operations. This is mostly a copy and paste of the codepaths currently used in cifs' posix acl xattr handler. After we have fully implemented the posix acl api and switched the vfs over to it, the cifs specific posix acl xattr handler and associated code will be removed and the code duplication will go away. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
2022-09-22 15:17:03 +00:00
.set_acl = cifs_set_acl,
};
cifs: fix use-after-free on the link name xfstests generic/011 reported use-after-free bug as follows: BUG: KASAN: use-after-free in __d_alloc+0x269/0x859 Read of size 15 at addr ffff8880078933a0 by task dirstress/952 CPU: 1 PID: 952 Comm: dirstress Not tainted 6.1.0-rc3+ #77 Call Trace: __dump_stack+0x23/0x29 dump_stack_lvl+0x51/0x73 print_address_description+0x67/0x27f print_report+0x3e/0x5c kasan_report+0x7b/0xa8 kasan_check_range+0x1b2/0x1c1 memcpy+0x22/0x5d __d_alloc+0x269/0x859 d_alloc+0x45/0x20c d_alloc_parallel+0xb2/0x8b2 lookup_open+0x3b8/0x9f9 open_last_lookups+0x63d/0xc26 path_openat+0x11a/0x261 do_filp_open+0xcc/0x168 do_sys_openat2+0x13b/0x3f7 do_sys_open+0x10f/0x146 __se_sys_creat+0x27/0x2e __x64_sys_creat+0x55/0x6a do_syscall_64+0x40/0x96 entry_SYSCALL_64_after_hwframe+0x63/0xcd Allocated by task 952: kasan_save_stack+0x1f/0x42 kasan_set_track+0x21/0x2a kasan_save_alloc_info+0x17/0x1d __kasan_kmalloc+0x7e/0x87 __kmalloc_node_track_caller+0x59/0x155 kstrndup+0x60/0xe6 parse_mf_symlink+0x215/0x30b check_mf_symlink+0x260/0x36a cifs_get_inode_info+0x14e1/0x1690 cifs_revalidate_dentry_attr+0x70d/0x964 cifs_revalidate_dentry+0x36/0x62 cifs_d_revalidate+0x162/0x446 lookup_open+0x36f/0x9f9 open_last_lookups+0x63d/0xc26 path_openat+0x11a/0x261 do_filp_open+0xcc/0x168 do_sys_openat2+0x13b/0x3f7 do_sys_open+0x10f/0x146 __se_sys_creat+0x27/0x2e __x64_sys_creat+0x55/0x6a do_syscall_64+0x40/0x96 entry_SYSCALL_64_after_hwframe+0x63/0xcd Freed by task 950: kasan_save_stack+0x1f/0x42 kasan_set_track+0x21/0x2a kasan_save_free_info+0x1c/0x34 ____kasan_slab_free+0x1c1/0x1d5 __kasan_slab_free+0xe/0x13 __kmem_cache_free+0x29a/0x387 kfree+0xd3/0x10e cifs_fattr_to_inode+0xb6a/0xc8c cifs_get_inode_info+0x3cb/0x1690 cifs_revalidate_dentry_attr+0x70d/0x964 cifs_revalidate_dentry+0x36/0x62 cifs_d_revalidate+0x162/0x446 lookup_open+0x36f/0x9f9 open_last_lookups+0x63d/0xc26 path_openat+0x11a/0x261 do_filp_open+0xcc/0x168 do_sys_openat2+0x13b/0x3f7 do_sys_open+0x10f/0x146 __se_sys_creat+0x27/0x2e __x64_sys_creat+0x55/0x6a do_syscall_64+0x40/0x96 entry_SYSCALL_64_after_hwframe+0x63/0xcd When opened a symlink, link name is from 'inode->i_link', but it may be reset to a new value when revalidate the dentry. If some processes get the link name on the race scenario, then UAF will happen on link name. Fix this by implementing 'get_link' interface to duplicate the link name. Fixes: 76894f3e2f71 ("cifs: improve symlink handling for smb2+") Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com> Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Steve French <stfrench@microsoft.com>
2022-11-04 07:44:41 +00:00
const char *cifs_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *done)
{
char *target_path;
if (!dentry)
return ERR_PTR(-ECHILD);
cifs: fix use-after-free on the link name xfstests generic/011 reported use-after-free bug as follows: BUG: KASAN: use-after-free in __d_alloc+0x269/0x859 Read of size 15 at addr ffff8880078933a0 by task dirstress/952 CPU: 1 PID: 952 Comm: dirstress Not tainted 6.1.0-rc3+ #77 Call Trace: __dump_stack+0x23/0x29 dump_stack_lvl+0x51/0x73 print_address_description+0x67/0x27f print_report+0x3e/0x5c kasan_report+0x7b/0xa8 kasan_check_range+0x1b2/0x1c1 memcpy+0x22/0x5d __d_alloc+0x269/0x859 d_alloc+0x45/0x20c d_alloc_parallel+0xb2/0x8b2 lookup_open+0x3b8/0x9f9 open_last_lookups+0x63d/0xc26 path_openat+0x11a/0x261 do_filp_open+0xcc/0x168 do_sys_openat2+0x13b/0x3f7 do_sys_open+0x10f/0x146 __se_sys_creat+0x27/0x2e __x64_sys_creat+0x55/0x6a do_syscall_64+0x40/0x96 entry_SYSCALL_64_after_hwframe+0x63/0xcd Allocated by task 952: kasan_save_stack+0x1f/0x42 kasan_set_track+0x21/0x2a kasan_save_alloc_info+0x17/0x1d __kasan_kmalloc+0x7e/0x87 __kmalloc_node_track_caller+0x59/0x155 kstrndup+0x60/0xe6 parse_mf_symlink+0x215/0x30b check_mf_symlink+0x260/0x36a cifs_get_inode_info+0x14e1/0x1690 cifs_revalidate_dentry_attr+0x70d/0x964 cifs_revalidate_dentry+0x36/0x62 cifs_d_revalidate+0x162/0x446 lookup_open+0x36f/0x9f9 open_last_lookups+0x63d/0xc26 path_openat+0x11a/0x261 do_filp_open+0xcc/0x168 do_sys_openat2+0x13b/0x3f7 do_sys_open+0x10f/0x146 __se_sys_creat+0x27/0x2e __x64_sys_creat+0x55/0x6a do_syscall_64+0x40/0x96 entry_SYSCALL_64_after_hwframe+0x63/0xcd Freed by task 950: kasan_save_stack+0x1f/0x42 kasan_set_track+0x21/0x2a kasan_save_free_info+0x1c/0x34 ____kasan_slab_free+0x1c1/0x1d5 __kasan_slab_free+0xe/0x13 __kmem_cache_free+0x29a/0x387 kfree+0xd3/0x10e cifs_fattr_to_inode+0xb6a/0xc8c cifs_get_inode_info+0x3cb/0x1690 cifs_revalidate_dentry_attr+0x70d/0x964 cifs_revalidate_dentry+0x36/0x62 cifs_d_revalidate+0x162/0x446 lookup_open+0x36f/0x9f9 open_last_lookups+0x63d/0xc26 path_openat+0x11a/0x261 do_filp_open+0xcc/0x168 do_sys_openat2+0x13b/0x3f7 do_sys_open+0x10f/0x146 __se_sys_creat+0x27/0x2e __x64_sys_creat+0x55/0x6a do_syscall_64+0x40/0x96 entry_SYSCALL_64_after_hwframe+0x63/0xcd When opened a symlink, link name is from 'inode->i_link', but it may be reset to a new value when revalidate the dentry. If some processes get the link name on the race scenario, then UAF will happen on link name. Fix this by implementing 'get_link' interface to duplicate the link name. Fixes: 76894f3e2f71 ("cifs: improve symlink handling for smb2+") Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com> Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Steve French <stfrench@microsoft.com>
2022-11-04 07:44:41 +00:00
target_path = kmalloc(PATH_MAX, GFP_KERNEL);
if (!target_path)
return ERR_PTR(-ENOMEM);
spin_lock(&inode->i_lock);
if (likely(CIFS_I(inode)->symlink_target)) {
strscpy(target_path, CIFS_I(inode)->symlink_target, PATH_MAX);
} else {
kfree(target_path);
target_path = ERR_PTR(-EOPNOTSUPP);
}
spin_unlock(&inode->i_lock);
if (!IS_ERR(target_path))
set_delayed_call(done, kfree_link, target_path);
return target_path;
}
const struct inode_operations cifs_symlink_inode_ops = {
cifs: fix use-after-free on the link name xfstests generic/011 reported use-after-free bug as follows: BUG: KASAN: use-after-free in __d_alloc+0x269/0x859 Read of size 15 at addr ffff8880078933a0 by task dirstress/952 CPU: 1 PID: 952 Comm: dirstress Not tainted 6.1.0-rc3+ #77 Call Trace: __dump_stack+0x23/0x29 dump_stack_lvl+0x51/0x73 print_address_description+0x67/0x27f print_report+0x3e/0x5c kasan_report+0x7b/0xa8 kasan_check_range+0x1b2/0x1c1 memcpy+0x22/0x5d __d_alloc+0x269/0x859 d_alloc+0x45/0x20c d_alloc_parallel+0xb2/0x8b2 lookup_open+0x3b8/0x9f9 open_last_lookups+0x63d/0xc26 path_openat+0x11a/0x261 do_filp_open+0xcc/0x168 do_sys_openat2+0x13b/0x3f7 do_sys_open+0x10f/0x146 __se_sys_creat+0x27/0x2e __x64_sys_creat+0x55/0x6a do_syscall_64+0x40/0x96 entry_SYSCALL_64_after_hwframe+0x63/0xcd Allocated by task 952: kasan_save_stack+0x1f/0x42 kasan_set_track+0x21/0x2a kasan_save_alloc_info+0x17/0x1d __kasan_kmalloc+0x7e/0x87 __kmalloc_node_track_caller+0x59/0x155 kstrndup+0x60/0xe6 parse_mf_symlink+0x215/0x30b check_mf_symlink+0x260/0x36a cifs_get_inode_info+0x14e1/0x1690 cifs_revalidate_dentry_attr+0x70d/0x964 cifs_revalidate_dentry+0x36/0x62 cifs_d_revalidate+0x162/0x446 lookup_open+0x36f/0x9f9 open_last_lookups+0x63d/0xc26 path_openat+0x11a/0x261 do_filp_open+0xcc/0x168 do_sys_openat2+0x13b/0x3f7 do_sys_open+0x10f/0x146 __se_sys_creat+0x27/0x2e __x64_sys_creat+0x55/0x6a do_syscall_64+0x40/0x96 entry_SYSCALL_64_after_hwframe+0x63/0xcd Freed by task 950: kasan_save_stack+0x1f/0x42 kasan_set_track+0x21/0x2a kasan_save_free_info+0x1c/0x34 ____kasan_slab_free+0x1c1/0x1d5 __kasan_slab_free+0xe/0x13 __kmem_cache_free+0x29a/0x387 kfree+0xd3/0x10e cifs_fattr_to_inode+0xb6a/0xc8c cifs_get_inode_info+0x3cb/0x1690 cifs_revalidate_dentry_attr+0x70d/0x964 cifs_revalidate_dentry+0x36/0x62 cifs_d_revalidate+0x162/0x446 lookup_open+0x36f/0x9f9 open_last_lookups+0x63d/0xc26 path_openat+0x11a/0x261 do_filp_open+0xcc/0x168 do_sys_openat2+0x13b/0x3f7 do_sys_open+0x10f/0x146 __se_sys_creat+0x27/0x2e __x64_sys_creat+0x55/0x6a do_syscall_64+0x40/0x96 entry_SYSCALL_64_after_hwframe+0x63/0xcd When opened a symlink, link name is from 'inode->i_link', but it may be reset to a new value when revalidate the dentry. If some processes get the link name on the race scenario, then UAF will happen on link name. Fix this by implementing 'get_link' interface to duplicate the link name. Fixes: 76894f3e2f71 ("cifs: improve symlink handling for smb2+") Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com> Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz> Signed-off-by: Steve French <stfrench@microsoft.com>
2022-11-04 07:44:41 +00:00
.get_link = cifs_get_link,
.setattr = cifs_setattr,
.permission = cifs_permission,
.listxattr = cifs_listxattr,
};
cifs: Fix flushing, invalidation and file size with copy_file_range() Fix a number of issues in the cifs filesystem implementation of the copy_file_range() syscall in cifs_file_copychunk_range(). Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a copy request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the copy source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the copy. Thirdly, set i_size after doing the copychunk_range operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the generic/075 xfstest to fail. Fixes: 620d8745b35d ("Introduce cifs_copy_file_range()") Cc: stable@vger.kernel.org Signed-off-by: David Howells <dhowells@redhat.com> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:00 +00:00
/*
* Advance the EOF marker to after the source range.
*/
static int cifs_precopy_set_eof(struct inode *src_inode, struct cifsInodeInfo *src_cifsi,
struct cifs_tcon *src_tcon,
unsigned int xid, loff_t src_end)
{
struct cifsFileInfo *writeable_srcfile;
int rc = -EINVAL;
writeable_srcfile = find_writable_file(src_cifsi, FIND_WR_FSUID_ONLY);
if (writeable_srcfile) {
if (src_tcon->ses->server->ops->set_file_size)
rc = src_tcon->ses->server->ops->set_file_size(
xid, src_tcon, writeable_srcfile,
src_inode->i_size, true /* no need to set sparse */);
else
rc = -ENOSYS;
cifsFileInfo_put(writeable_srcfile);
cifs_dbg(FYI, "SetFSize for copychunk rc = %d\n", rc);
}
if (rc < 0)
goto set_failed;
netfs: Optimise away reads above the point at which there can be no data Track the file position above which the server is not expected to have any data (the "zero point") and preemptively assume that we can satisfy requests by filling them with zeroes locally rather than attempting to download them if they're over that line - even if we've written data back to the server. Assume that any data that was written back above that position is held in the local cache. Note that we have to split requests that straddle the line. Make use of this to optimise away some reads from the server. We need to set the zero point in the following circumstances: (1) When we see an extant remote inode and have no cache for it, we set the zero_point to i_size. (2) On local inode creation, we set zero_point to 0. (3) On local truncation down, we reduce zero_point to the new i_size if the new i_size is lower. (4) On local truncation up, we don't change zero_point. (5) On local modification, we don't change zero_point. (6) On remote invalidation, we set zero_point to the new i_size. (7) If stored data is discarded from the pagecache or culled from fscache, we must set zero_point above that if the data also got written to the server. (8) If dirty data is written back to the server, but not fscache, we must set zero_point above that. (9) If a direct I/O write is made, set zero_point above that. Assuming the above, any read from the server at or above the zero_point position will return all zeroes. The zero_point value can be stored in the cache, provided the above rules are applied to it by any code that culls part of the local cache. Signed-off-by: David Howells <dhowells@redhat.com> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org
2023-11-24 13:39:02 +00:00
netfs_resize_file(&src_cifsi->netfs, src_end, true);
cifs: Fix flushing, invalidation and file size with copy_file_range() Fix a number of issues in the cifs filesystem implementation of the copy_file_range() syscall in cifs_file_copychunk_range(). Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a copy request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the copy source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the copy. Thirdly, set i_size after doing the copychunk_range operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the generic/075 xfstest to fail. Fixes: 620d8745b35d ("Introduce cifs_copy_file_range()") Cc: stable@vger.kernel.org Signed-off-by: David Howells <dhowells@redhat.com> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:00 +00:00
fscache_resize_cookie(cifs_inode_cookie(src_inode), src_end);
return 0;
set_failed:
return filemap_write_and_wait(src_inode->i_mapping);
}
/*
* Flush out either the folio that overlaps the beginning of a range in which
* pos resides or the folio that overlaps the end of a range unless that folio
* is entirely within the range we're going to invalidate. We extend the flush
* bounds to encompass the folio.
*/
static int cifs_flush_folio(struct inode *inode, loff_t pos, loff_t *_fstart, loff_t *_fend,
bool first)
{
struct folio *folio;
unsigned long long fpos, fend;
pgoff_t index = pos / PAGE_SIZE;
size_t size;
int rc = 0;
folio = filemap_get_folio(inode->i_mapping, index);
if (IS_ERR(folio))
return 0;
size = folio_size(folio);
fpos = folio_pos(folio);
fend = fpos + size - 1;
*_fstart = min_t(unsigned long long, *_fstart, fpos);
*_fend = max_t(unsigned long long, *_fend, fend);
if ((first && pos == fpos) || (!first && pos == fend))
goto out;
rc = filemap_write_and_wait_range(inode->i_mapping, fpos, fend);
out:
folio_put(folio);
return rc;
}
static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
{
struct inode *src_inode = file_inode(src_file);
struct inode *target_inode = file_inode(dst_file);
cifs: Fix flushing, invalidation and file size with FICLONE Fix a number of issues in the cifs filesystem implementation of the FICLONE ioctl in cifs_remap_file_range(). This is analogous to the previously fixed bug in cifs_file_copychunk_range() and can share the helper functions. Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a clone request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the clone source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the extent duplication. Thirdly, set i_size after doing the duplicate_extents operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the cifs/001 xfstest to fail. Fixes: 04b38d601239 ("vfs: pull btrfs clone API to vfs layer") Signed-off-by: David Howells <dhowells@redhat.com> Cc: stable@vger.kernel.org cc: Christoph Hellwig <hch@lst.de> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:01 +00:00
struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode);
struct cifsInodeInfo *target_cifsi = CIFS_I(target_inode);
struct cifsFileInfo *smb_file_src = src_file->private_data;
cifs: Fix flushing, invalidation and file size with FICLONE Fix a number of issues in the cifs filesystem implementation of the FICLONE ioctl in cifs_remap_file_range(). This is analogous to the previously fixed bug in cifs_file_copychunk_range() and can share the helper functions. Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a clone request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the clone source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the extent duplication. Thirdly, set i_size after doing the duplicate_extents operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the cifs/001 xfstest to fail. Fixes: 04b38d601239 ("vfs: pull btrfs clone API to vfs layer") Signed-off-by: David Howells <dhowells@redhat.com> Cc: stable@vger.kernel.org cc: Christoph Hellwig <hch@lst.de> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:01 +00:00
struct cifsFileInfo *smb_file_target = dst_file->private_data;
struct cifs_tcon *target_tcon, *src_tcon;
unsigned long long destend, fstart, fend, new_size;
unsigned int xid;
int rc;
if (remap_flags & REMAP_FILE_DEDUP)
return -EOPNOTSUPP;
if (remap_flags & ~REMAP_FILE_ADVISORY)
return -EINVAL;
cifs_dbg(FYI, "clone range\n");
xid = get_xid();
cifs: Fix flushing, invalidation and file size with FICLONE Fix a number of issues in the cifs filesystem implementation of the FICLONE ioctl in cifs_remap_file_range(). This is analogous to the previously fixed bug in cifs_file_copychunk_range() and can share the helper functions. Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a clone request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the clone source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the extent duplication. Thirdly, set i_size after doing the duplicate_extents operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the cifs/001 xfstest to fail. Fixes: 04b38d601239 ("vfs: pull btrfs clone API to vfs layer") Signed-off-by: David Howells <dhowells@redhat.com> Cc: stable@vger.kernel.org cc: Christoph Hellwig <hch@lst.de> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:01 +00:00
if (!smb_file_src || !smb_file_target) {
rc = -EBADF;
cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
goto out;
}
cifs: Fix flushing, invalidation and file size with FICLONE Fix a number of issues in the cifs filesystem implementation of the FICLONE ioctl in cifs_remap_file_range(). This is analogous to the previously fixed bug in cifs_file_copychunk_range() and can share the helper functions. Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a clone request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the clone source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the extent duplication. Thirdly, set i_size after doing the duplicate_extents operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the cifs/001 xfstest to fail. Fixes: 04b38d601239 ("vfs: pull btrfs clone API to vfs layer") Signed-off-by: David Howells <dhowells@redhat.com> Cc: stable@vger.kernel.org cc: Christoph Hellwig <hch@lst.de> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:01 +00:00
src_tcon = tlink_tcon(smb_file_src->tlink);
target_tcon = tlink_tcon(smb_file_target->tlink);
/*
* Note: cifs case is easier than btrfs since server responsible for
* checks for proper open modes and file type and if it wants
* server could even support copy of range where source = target
*/
lock_two_nondirectories(target_inode, src_inode);
if (len == 0)
len = src_inode->i_size - off;
cifs: Fix flushing, invalidation and file size with FICLONE Fix a number of issues in the cifs filesystem implementation of the FICLONE ioctl in cifs_remap_file_range(). This is analogous to the previously fixed bug in cifs_file_copychunk_range() and can share the helper functions. Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a clone request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the clone source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the extent duplication. Thirdly, set i_size after doing the duplicate_extents operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the cifs/001 xfstest to fail. Fixes: 04b38d601239 ("vfs: pull btrfs clone API to vfs layer") Signed-off-by: David Howells <dhowells@redhat.com> Cc: stable@vger.kernel.org cc: Christoph Hellwig <hch@lst.de> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:01 +00:00
cifs_dbg(FYI, "clone range\n");
/* Flush the source buffer */
rc = filemap_write_and_wait_range(src_inode->i_mapping, off,
off + len - 1);
if (rc)
goto unlock;
/* The server-side copy will fail if the source crosses the EOF marker.
* Advance the EOF marker after the flush above to the end of the range
* if it's short of that.
*/
if (src_cifsi->netfs.remote_i_size < off + len) {
rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len);
if (rc < 0)
goto unlock;
}
new_size = destoff + len;
destend = destoff + len - 1;
cifs: Fix flushing, invalidation and file size with FICLONE Fix a number of issues in the cifs filesystem implementation of the FICLONE ioctl in cifs_remap_file_range(). This is analogous to the previously fixed bug in cifs_file_copychunk_range() and can share the helper functions. Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a clone request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the clone source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the extent duplication. Thirdly, set i_size after doing the duplicate_extents operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the cifs/001 xfstest to fail. Fixes: 04b38d601239 ("vfs: pull btrfs clone API to vfs layer") Signed-off-by: David Howells <dhowells@redhat.com> Cc: stable@vger.kernel.org cc: Christoph Hellwig <hch@lst.de> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:01 +00:00
/* Flush the folios at either end of the destination range to prevent
* accidental loss of dirty data outside of the range.
*/
fstart = destoff;
fend = destend;
rc = cifs_flush_folio(target_inode, destoff, &fstart, &fend, true);
if (rc)
goto unlock;
rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false);
if (rc)
goto unlock;
/* Discard all the folios that overlap the destination region. */
cifs_dbg(FYI, "about to discard pages %llx-%llx\n", fstart, fend);
truncate_inode_pages_range(&target_inode->i_data, fstart, fend);
fscache_invalidate(cifs_inode_cookie(target_inode), NULL,
i_size_read(target_inode), 0);
rc = -EOPNOTSUPP;
if (target_tcon->ses->server->ops->duplicate_extents) {
rc = target_tcon->ses->server->ops->duplicate_extents(xid,
smb_file_src, smb_file_target, off, len, destoff);
cifs: Fix flushing, invalidation and file size with FICLONE Fix a number of issues in the cifs filesystem implementation of the FICLONE ioctl in cifs_remap_file_range(). This is analogous to the previously fixed bug in cifs_file_copychunk_range() and can share the helper functions. Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a clone request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the clone source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the extent duplication. Thirdly, set i_size after doing the duplicate_extents operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the cifs/001 xfstest to fail. Fixes: 04b38d601239 ("vfs: pull btrfs clone API to vfs layer") Signed-off-by: David Howells <dhowells@redhat.com> Cc: stable@vger.kernel.org cc: Christoph Hellwig <hch@lst.de> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:01 +00:00
if (rc == 0 && new_size > i_size_read(target_inode)) {
truncate_setsize(target_inode, new_size);
netfs: Optimise away reads above the point at which there can be no data Track the file position above which the server is not expected to have any data (the "zero point") and preemptively assume that we can satisfy requests by filling them with zeroes locally rather than attempting to download them if they're over that line - even if we've written data back to the server. Assume that any data that was written back above that position is held in the local cache. Note that we have to split requests that straddle the line. Make use of this to optimise away some reads from the server. We need to set the zero point in the following circumstances: (1) When we see an extant remote inode and have no cache for it, we set the zero_point to i_size. (2) On local inode creation, we set zero_point to 0. (3) On local truncation down, we reduce zero_point to the new i_size if the new i_size is lower. (4) On local truncation up, we don't change zero_point. (5) On local modification, we don't change zero_point. (6) On remote invalidation, we set zero_point to the new i_size. (7) If stored data is discarded from the pagecache or culled from fscache, we must set zero_point above that if the data also got written to the server. (8) If dirty data is written back to the server, but not fscache, we must set zero_point above that. (9) If a direct I/O write is made, set zero_point above that. Assuming the above, any read from the server at or above the zero_point position will return all zeroes. The zero_point value can be stored in the cache, provided the above rules are applied to it by any code that culls part of the local cache. Signed-off-by: David Howells <dhowells@redhat.com> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org
2023-11-24 13:39:02 +00:00
netfs_resize_file(&target_cifsi->netfs, new_size, true);
cifs: Fix flushing, invalidation and file size with FICLONE Fix a number of issues in the cifs filesystem implementation of the FICLONE ioctl in cifs_remap_file_range(). This is analogous to the previously fixed bug in cifs_file_copychunk_range() and can share the helper functions. Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a clone request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the clone source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the extent duplication. Thirdly, set i_size after doing the duplicate_extents operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the cifs/001 xfstest to fail. Fixes: 04b38d601239 ("vfs: pull btrfs clone API to vfs layer") Signed-off-by: David Howells <dhowells@redhat.com> Cc: stable@vger.kernel.org cc: Christoph Hellwig <hch@lst.de> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:01 +00:00
fscache_resize_cookie(cifs_inode_cookie(target_inode),
new_size);
}
}
/* force revalidate of size and timestamps of target file now
that target is updated on the server */
CIFS_I(target_inode)->time = 0;
cifs: Fix flushing, invalidation and file size with FICLONE Fix a number of issues in the cifs filesystem implementation of the FICLONE ioctl in cifs_remap_file_range(). This is analogous to the previously fixed bug in cifs_file_copychunk_range() and can share the helper functions. Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a clone request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the clone source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the extent duplication. Thirdly, set i_size after doing the duplicate_extents operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the cifs/001 xfstest to fail. Fixes: 04b38d601239 ("vfs: pull btrfs clone API to vfs layer") Signed-off-by: David Howells <dhowells@redhat.com> Cc: stable@vger.kernel.org cc: Christoph Hellwig <hch@lst.de> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:01 +00:00
unlock:
/* although unlocking in the reverse order from locking is not
strictly necessary here it is a little cleaner to be consistent */
unlock_two_nondirectories(src_inode, target_inode);
out:
free_xid(xid);
return rc < 0 ? rc : len;
}
ssize_t cifs_file_copychunk_range(unsigned int xid,
struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff,
size_t len, unsigned int flags)
{
struct inode *src_inode = file_inode(src_file);
struct inode *target_inode = file_inode(dst_file);
cifs: Fix flushing, invalidation and file size with copy_file_range() Fix a number of issues in the cifs filesystem implementation of the copy_file_range() syscall in cifs_file_copychunk_range(). Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a copy request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the copy source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the copy. Thirdly, set i_size after doing the copychunk_range operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the generic/075 xfstest to fail. Fixes: 620d8745b35d ("Introduce cifs_copy_file_range()") Cc: stable@vger.kernel.org Signed-off-by: David Howells <dhowells@redhat.com> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:00 +00:00
struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode);
struct cifsInodeInfo *target_cifsi = CIFS_I(target_inode);
struct cifsFileInfo *smb_file_src;
struct cifsFileInfo *smb_file_target;
struct cifs_tcon *src_tcon;
struct cifs_tcon *target_tcon;
cifs: Fix flushing, invalidation and file size with copy_file_range() Fix a number of issues in the cifs filesystem implementation of the copy_file_range() syscall in cifs_file_copychunk_range(). Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a copy request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the copy source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the copy. Thirdly, set i_size after doing the copychunk_range operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the generic/075 xfstest to fail. Fixes: 620d8745b35d ("Introduce cifs_copy_file_range()") Cc: stable@vger.kernel.org Signed-off-by: David Howells <dhowells@redhat.com> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:00 +00:00
unsigned long long destend, fstart, fend;
ssize_t rc;
cifs_dbg(FYI, "copychunk range\n");
if (!src_file->private_data || !dst_file->private_data) {
rc = -EBADF;
cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
goto out;
}
rc = -EXDEV;
smb_file_target = dst_file->private_data;
smb_file_src = src_file->private_data;
src_tcon = tlink_tcon(smb_file_src->tlink);
target_tcon = tlink_tcon(smb_file_target->tlink);
if (src_tcon->ses != target_tcon->ses) {
cifs_dbg(VFS, "source and target of copy not on same server\n");
goto out;
}
rc = -EOPNOTSUPP;
if (!target_tcon->ses->server->ops->copychunk_range)
goto out;
/*
* Note: cifs case is easier than btrfs since server responsible for
* checks for proper open modes and file type and if it wants
* server could even support copy of range where source = target
*/
lock_two_nondirectories(target_inode, src_inode);
cifs_dbg(FYI, "about to flush pages\n");
rc = filemap_write_and_wait_range(src_inode->i_mapping, off,
off + len - 1);
if (rc)
goto unlock;
cifs: Fix flushing, invalidation and file size with copy_file_range() Fix a number of issues in the cifs filesystem implementation of the copy_file_range() syscall in cifs_file_copychunk_range(). Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a copy request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the copy source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the copy. Thirdly, set i_size after doing the copychunk_range operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the generic/075 xfstest to fail. Fixes: 620d8745b35d ("Introduce cifs_copy_file_range()") Cc: stable@vger.kernel.org Signed-off-by: David Howells <dhowells@redhat.com> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:00 +00:00
/* The server-side copy will fail if the source crosses the EOF marker.
* Advance the EOF marker after the flush above to the end of the range
* if it's short of that.
*/
if (src_cifsi->netfs.remote_i_size < off + len) {
cifs: Fix flushing, invalidation and file size with copy_file_range() Fix a number of issues in the cifs filesystem implementation of the copy_file_range() syscall in cifs_file_copychunk_range(). Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a copy request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the copy source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the copy. Thirdly, set i_size after doing the copychunk_range operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the generic/075 xfstest to fail. Fixes: 620d8745b35d ("Introduce cifs_copy_file_range()") Cc: stable@vger.kernel.org Signed-off-by: David Howells <dhowells@redhat.com> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:00 +00:00
rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len);
if (rc < 0)
goto unlock;
}
destend = destoff + len - 1;
/* Flush the folios at either end of the destination range to prevent
* accidental loss of dirty data outside of the range.
*/
fstart = destoff;
fend = destend;
rc = cifs_flush_folio(target_inode, destoff, &fstart, &fend, true);
if (rc)
goto unlock;
rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false);
if (rc)
goto unlock;
/* Discard all the folios that overlap the destination region. */
truncate_inode_pages_range(&target_inode->i_data, fstart, fend);
fscache_invalidate(cifs_inode_cookie(target_inode), NULL,
i_size_read(target_inode), 0);
rc = file_modified(dst_file);
cifs: Fix flushing, invalidation and file size with copy_file_range() Fix a number of issues in the cifs filesystem implementation of the copy_file_range() syscall in cifs_file_copychunk_range(). Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a copy request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the copy source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the copy. Thirdly, set i_size after doing the copychunk_range operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the generic/075 xfstest to fail. Fixes: 620d8745b35d ("Introduce cifs_copy_file_range()") Cc: stable@vger.kernel.org Signed-off-by: David Howells <dhowells@redhat.com> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:00 +00:00
if (!rc) {
rc = target_tcon->ses->server->ops->copychunk_range(xid,
smb_file_src, smb_file_target, off, len, destoff);
if (rc > 0 && destoff + rc > i_size_read(target_inode)) {
cifs: Fix flushing, invalidation and file size with copy_file_range() Fix a number of issues in the cifs filesystem implementation of the copy_file_range() syscall in cifs_file_copychunk_range(). Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a copy request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the copy source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the copy. Thirdly, set i_size after doing the copychunk_range operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the generic/075 xfstest to fail. Fixes: 620d8745b35d ("Introduce cifs_copy_file_range()") Cc: stable@vger.kernel.org Signed-off-by: David Howells <dhowells@redhat.com> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:00 +00:00
truncate_setsize(target_inode, destoff + rc);
netfs_resize_file(&target_cifsi->netfs,
i_size_read(target_inode), true);
fscache_resize_cookie(cifs_inode_cookie(target_inode),
i_size_read(target_inode));
}
if (rc > 0 && destoff + rc > target_cifsi->netfs.zero_point)
target_cifsi->netfs.zero_point = destoff + rc;
cifs: Fix flushing, invalidation and file size with copy_file_range() Fix a number of issues in the cifs filesystem implementation of the copy_file_range() syscall in cifs_file_copychunk_range(). Firstly, the invalidation of the destination range is handled incorrectly: We shouldn't just invalidate the whole file as dirty data in the file may get lost and we can't just call truncate_inode_pages_range() to invalidate the destination range as that will erase parts of a partial folio at each end whilst invalidating and discarding all the folios in the middle. We need to force all the folios covering the range to be reloaded, but we mustn't lose dirty data in them that's not in the destination range. Further, we shouldn't simply round out the range to PAGE_SIZE at each end as cifs should move to support multipage folios. Secondly, there's an issue whereby a write may have extended the file locally, but not have been written back yet. This can leaves the local idea of the EOF at a later point than the server's EOF. If a copy request is issued, this will fail on the server with STATUS_INVALID_VIEW_SIZE (which gets translated to -EIO locally) if the copy source extends past the server's EOF. Fix this by: (0) Flush the source region (already done). The flush does nothing and the EOF isn't moved if the source region has no dirty data. (1) Move the EOF to the end of the source region if it isn't already at least at this point. If we can't do this, for instance if the server doesn't support it, just flush the entire source file. (2) Find the folio (if present) at each end of the range, flushing it and increasing the region-to-be-invalidated to cover those in their entirety. (3) Fully discard all the folios covering the range as we want them to be reloaded. (4) Then perform the copy. Thirdly, set i_size after doing the copychunk_range operation as this value may be used by various things internally. stat() hides the issue because setting ->time to 0 causes cifs_getatr() to revalidate the attributes. These were causing the generic/075 xfstest to fail. Fixes: 620d8745b35d ("Introduce cifs_copy_file_range()") Cc: stable@vger.kernel.org Signed-off-by: David Howells <dhowells@redhat.com> cc: Paulo Alcantara <pc@manguebit.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: Rohith Surabattula <rohiths.msft@gmail.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2023-12-01 00:22:00 +00:00
}
file_accessed(src_file);
/* force revalidate of size and timestamps of target file now
* that target is updated on the server
*/
CIFS_I(target_inode)->time = 0;
unlock:
/* although unlocking in the reverse order from locking is not
* strictly necessary here it is a little cleaner to be consistent
*/
unlock_two_nondirectories(src_inode, target_inode);
out:
return rc;
}
/*
* Directory operations under CIFS/SMB2/SMB3 are synchronous, so fsync()
* is a dummy operation.
*/
static int cifs_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
cifs_dbg(FYI, "Sync directory - name: %pD datasync: 0x%x\n",
file, datasync);
return 0;
}
static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff,
size_t len, unsigned int flags)
{
unsigned int xid = get_xid();
ssize_t rc;
struct cifsFileInfo *cfile = dst_file->private_data;
if (cfile->swapfile) {
rc = -EOPNOTSUPP;
free_xid(xid);
return rc;
}
rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff,
len, flags);
free_xid(xid);
if (rc == -EOPNOTSUPP || rc == -EXDEV)
rc = splice_copy_file_range(src_file, off, dst_file,
destoff, len);
return rc;
}
const struct file_operations cifs_file_ops = {
.read_iter = cifs_loose_read_iter,
.write_iter = cifs_file_write_iter,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
.flock = cifs_flock,
.fsync = cifs_fsync,
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
.remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
const struct file_operations cifs_file_strict_ops = {
.read_iter = cifs_strict_readv,
.write_iter = cifs_strict_writev,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
.flock = cifs_flock,
.fsync = cifs_strict_fsync,
.flush = cifs_flush,
.mmap = cifs_file_strict_mmap,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
.remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
const struct file_operations cifs_file_direct_ops = {
.read_iter = cifs_direct_readv,
.write_iter = cifs_direct_writev,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
.flock = cifs_flock,
.fsync = cifs_fsync,
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = copy_splice_read,
.splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
.remap_file_range = cifs_remap_file_range,
.llseek = cifs_llseek,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
const struct file_operations cifs_file_nobrl_ops = {
.read_iter = cifs_loose_read_iter,
.write_iter = cifs_file_write_iter,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_fsync,
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
.remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
const struct file_operations cifs_file_strict_nobrl_ops = {
.read_iter = cifs_strict_readv,
.write_iter = cifs_strict_writev,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_strict_fsync,
.flush = cifs_flush,
.mmap = cifs_file_strict_mmap,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
.remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
const struct file_operations cifs_file_direct_nobrl_ops = {
.read_iter = cifs_direct_readv,
.write_iter = cifs_direct_writev,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_fsync,
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = copy_splice_read,
.splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
.remap_file_range = cifs_remap_file_range,
.llseek = cifs_llseek,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
const struct file_operations cifs_dir_ops = {
.iterate_shared = cifs_readdir,
.release = cifs_closedir,
.read = generic_read_dir,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
.remap_file_range = cifs_remap_file_range,
.llseek = generic_file_llseek,
.fsync = cifs_dir_fsync,
};
static void
cifs_init_once(void *inode)
{
struct cifsInodeInfo *cifsi = inode;
netfs: Fix gcc-12 warning by embedding vfs inode in netfs_i_context While randstruct was satisfied with using an open-coded "void *" offset cast for the netfs_i_context <-> inode casting, __builtin_object_size() as used by FORTIFY_SOURCE was not as easily fooled. This was causing the following complaint[1] from gcc v12: In file included from include/linux/string.h:253, from include/linux/ceph/ceph_debug.h:7, from fs/ceph/inode.c:2: In function 'fortify_memset_chk', inlined from 'netfs_i_context_init' at include/linux/netfs.h:326:2, inlined from 'ceph_alloc_inode' at fs/ceph/inode.c:463:2: include/linux/fortify-string.h:242:25: warning: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Wattribute-warning] 242 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by embedding a struct inode into struct netfs_i_context (which should perhaps be renamed to struct netfs_inode). The struct inode vfs_inode fields are then removed from the 9p, afs, ceph and cifs inode structs and vfs_inode is then simply changed to "netfs.inode" in those filesystems. Further, rename netfs_i_context to netfs_inode, get rid of the netfs_inode() function that converted a netfs_i_context pointer to an inode pointer (that can now be done with &ctx->inode) and rename the netfs_i_context() function to netfs_inode() (which is now a wrapper around container_of()). Most of the changes were done with: perl -p -i -e 's/vfs_inode/netfs.inode/'g \ `git grep -l 'vfs_inode' -- fs/{9p,afs,ceph,cifs}/*.[ch]` Kees suggested doing it with a pair structure[2] and a special declarator to insert that into the network filesystem's inode wrapper[3], but I think it's cleaner to embed it - and then it doesn't matter if struct randomisation reorders things. Dave Chinner suggested using a filesystem-specific VFS_I() function in each filesystem to convert that filesystem's own inode wrapper struct into the VFS inode struct[4]. Version #2: - Fix a couple of missed name changes due to a disabled cifs option. - Rename nfs_i_context to nfs_inode - Use "netfs" instead of "nic" as the member name in per-fs inode wrapper structs. [ This also undoes commit 507160f46c55 ("netfs: gcc-12: temporarily disable '-Wattribute-warning' for now") that is no longer needed ] Fixes: bc899ee1c898 ("netfs: Add a netfs inode context") Reported-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Xiubo Li <xiubli@redhat.com> cc: Jonathan Corbet <corbet@lwn.net> cc: Eric Van Hensbergen <ericvh@gmail.com> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: Ilya Dryomov <idryomov@gmail.com> cc: Steve French <smfrench@gmail.com> cc: William Kucharski <william.kucharski@oracle.com> cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> cc: Dave Chinner <david@fromorbit.com> cc: linux-doc@vger.kernel.org cc: v9fs-developer@lists.sourceforge.net cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: samba-technical@lists.samba.org cc: linux-fsdevel@vger.kernel.org cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/r/d2ad3a3d7bdd794c6efb562d2f2b655fb67756b9.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/20220517210230.864239-1-keescook@chromium.org/ [2] Link: https://lore.kernel.org/r/20220518202212.2322058-1-keescook@chromium.org/ [3] Link: https://lore.kernel.org/r/20220524101205.GI2306852@dread.disaster.area/ [4] Link: https://lore.kernel.org/r/165296786831.3591209.12111293034669289733.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165305805651.4094995.7763502506786714216.stgit@warthog.procyon.org.uk # v2 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-06-09 20:46:04 +00:00
inode_init_once(&cifsi->netfs.inode);
init_rwsem(&cifsi->lock_sem);
}
static int __init
cifs_init_inodecache(void)
{
cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
sizeof(struct cifsInodeInfo),
0, (SLAB_RECLAIM_ACCOUNT|
2016-01-14 23:18:21 +00:00
SLAB_MEM_SPREAD|SLAB_ACCOUNT),
cifs_init_once);
if (cifs_inode_cachep == NULL)
return -ENOMEM;
return 0;
}
static void
cifs_destroy_inodecache(void)
{
/*
* Make sure all delayed rcu free inodes are flushed before we
* destroy cache.
*/
rcu_barrier();
kmem_cache_destroy(cifs_inode_cachep);
}
static int
cifs_init_request_bufs(void)
{
/*
* SMB2 maximum header size is bigger than CIFS one - no problems to
* allocate some more bytes for CIFS.
*/
size_t max_hdr_size = MAX_SMB2_HDR_SIZE;
if (CIFSMaxBufSize < 8192) {
/* Buffer size can not be smaller than 2 * PATH_MAX since maximum
Unicode path name has to fit in any SMB/CIFS path based frames */
CIFSMaxBufSize = 8192;
} else if (CIFSMaxBufSize > 1024*127) {
CIFSMaxBufSize = 1024 * 127;
} else {
CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/
}
/*
cifs_dbg(VFS, "CIFSMaxBufSize %d 0x%x\n",
CIFSMaxBufSize, CIFSMaxBufSize);
*/
cifs_req_cachep = kmem_cache_create_usercopy("cifs_request",
CIFSMaxBufSize + max_hdr_size, 0,
SLAB_HWCACHE_ALIGN, 0,
CIFSMaxBufSize + max_hdr_size,
NULL);
if (cifs_req_cachep == NULL)
return -ENOMEM;
if (cifs_min_rcv < 1)
cifs_min_rcv = 1;
else if (cifs_min_rcv > 64) {
cifs_min_rcv = 64;
cifs_dbg(VFS, "cifs_min_rcv set to maximum (64)\n");
}
cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
cifs_req_cachep);
if (cifs_req_poolp == NULL) {
kmem_cache_destroy(cifs_req_cachep);
return -ENOMEM;
}
/* MAX_CIFS_SMALL_BUFFER_SIZE bytes is enough for most SMB responses and
almost all handle based requests (but not write response, nor is it
sufficient for path based requests). A smaller size would have
been more efficient (compacting multiple slab items on one 4k page)
for the case in which debug was on, but this larger size allows
more SMBs to use small buffer alloc and is still much more
efficient to alloc 1 per page off the slab compared to 17K (5page)
alloc of large cifs buffers even when page debugging is on */
cifs_sm_req_cachep = kmem_cache_create_usercopy("cifs_small_rq",
MAX_CIFS_SMALL_BUFFER_SIZE, 0, SLAB_HWCACHE_ALIGN,
0, MAX_CIFS_SMALL_BUFFER_SIZE, NULL);
if (cifs_sm_req_cachep == NULL) {
mempool_destroy(cifs_req_poolp);
kmem_cache_destroy(cifs_req_cachep);
return -ENOMEM;
}
if (cifs_min_small < 2)
cifs_min_small = 2;
else if (cifs_min_small > 256) {
cifs_min_small = 256;
cifs_dbg(FYI, "cifs_min_small set to maximum (256)\n");
}
cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
cifs_sm_req_cachep);
if (cifs_sm_req_poolp == NULL) {
mempool_destroy(cifs_req_poolp);
kmem_cache_destroy(cifs_req_cachep);
kmem_cache_destroy(cifs_sm_req_cachep);
return -ENOMEM;
}
return 0;
}
static void
cifs_destroy_request_bufs(void)
{
mempool_destroy(cifs_req_poolp);
kmem_cache_destroy(cifs_req_cachep);
mempool_destroy(cifs_sm_req_poolp);
kmem_cache_destroy(cifs_sm_req_cachep);
}
static int init_mids(void)
{
cifs_mid_cachep = kmem_cache_create("cifs_mpx_ids",
sizeof(struct mid_q_entry), 0,
SLAB_HWCACHE_ALIGN, NULL);
if (cifs_mid_cachep == NULL)
return -ENOMEM;
/* 3 is a reasonable minimum number of simultaneous operations */
cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep);
if (cifs_mid_poolp == NULL) {
kmem_cache_destroy(cifs_mid_cachep);
return -ENOMEM;
}
return 0;
}
static void destroy_mids(void)
{
mempool_destroy(cifs_mid_poolp);
kmem_cache_destroy(cifs_mid_cachep);
}
static int __init
init_cifs(void)
{
int rc = 0;
cifs_proc_init();
INIT_LIST_HEAD(&cifs_tcp_ses_list);
/*
* Initialize Global counters
*/
atomic_set(&sesInfoAllocCount, 0);
atomic_set(&tconInfoAllocCount, 0);
atomic_set(&tcpSesNextId, 0);
atomic_set(&tcpSesAllocCount, 0);
atomic_set(&tcpSesReconnectCount, 0);
atomic_set(&tconInfoReconnectCount, 0);
atomic_set(&buf_alloc_count, 0);
atomic_set(&small_buf_alloc_count, 0);
#ifdef CONFIG_CIFS_STATS2
atomic_set(&total_buf_alloc_count, 0);
atomic_set(&total_small_buf_alloc_count, 0);
if (slow_rsp_threshold < 1)
cifs_dbg(FYI, "slow_response_threshold msgs disabled\n");
else if (slow_rsp_threshold > 32767)
cifs_dbg(VFS,
"slow response threshold set higher than recommended (0 to 32767)\n");
#endif /* CONFIG_CIFS_STATS2 */
atomic_set(&mid_count, 0);
GlobalCurrentXid = 0;
GlobalTotalActiveXid = 0;
GlobalMaxActiveXid = 0;
spin_lock_init(&cifs_tcp_ses_lock);
spin_lock_init(&GlobalMid_Lock);
cifs_lock_secret = get_random_u32();
if (cifs_max_pending < 2) {
cifs_max_pending = 2;
cifs_dbg(FYI, "cifs_max_pending set to min of 2\n");
} else if (cifs_max_pending > CIFS_MAX_REQ) {
cifs_max_pending = CIFS_MAX_REQ;
cifs_dbg(FYI, "cifs_max_pending set to max of %u\n",
CIFS_MAX_REQ);
}
/* Limit max to about 18 hours, and setting to zero disables directory entry caching */
if (dir_cache_timeout > 65000) {
dir_cache_timeout = 65000;
cifs_dbg(VFS, "dir_cache_timeout set to max of 65000 seconds\n");
}
cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
if (!cifsiod_wq) {
rc = -ENOMEM;
goto out_clean_proc;
}
/*
* Consider in future setting limit!=0 maybe to min(num_of_cores - 1, 3)
* so that we don't launch too many worker threads but
* Documentation/core-api/workqueue.rst recommends setting it to 0
*/
/* WQ_UNBOUND allows decrypt tasks to run on any CPU */
decrypt_wq = alloc_workqueue("smb3decryptd",
WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
if (!decrypt_wq) {
rc = -ENOMEM;
goto out_destroy_cifsiod_wq;
}
cifs: move cifsFileInfo_put logic into a work-queue This patch moves the final part of the cifsFileInfo_put() logic where we need a write lock on lock_sem to be processed in a separate thread that holds no other locks. This is to prevent deadlocks like the one below: > there are 6 processes looping to while trying to down_write > cinode->lock_sem, 5 of them from _cifsFileInfo_put, and one from > cifs_new_fileinfo > > and there are 5 other processes which are blocked, several of them > waiting on either PG_writeback or PG_locked (which are both set), all > for the same page of the file > > 2 inode_lock() (inode->i_rwsem) for the file > 1 wait_on_page_writeback() for the page > 1 down_read(inode->i_rwsem) for the inode of the directory > 1 inode_lock()(inode->i_rwsem) for the inode of the directory > 1 __lock_page > > > so processes are blocked waiting on: > page flags PG_locked and PG_writeback for one specific page > inode->i_rwsem for the directory > inode->i_rwsem for the file > cifsInodeInflock_sem > > > > here are the more gory details (let me know if I need to provide > anything more/better): > > [0 00:48:22.765] [UN] PID: 8863 TASK: ffff8c691547c5c0 CPU: 3 > COMMAND: "reopen_file" > #0 [ffff9965007e3ba8] __schedule at ffffffff9b6e6095 > #1 [ffff9965007e3c38] schedule at ffffffff9b6e64df > #2 [ffff9965007e3c48] rwsem_down_write_slowpath at ffffffff9af283d7 > #3 [ffff9965007e3cb8] legitimize_path at ffffffff9b0f975d > #4 [ffff9965007e3d08] path_openat at ffffffff9b0fe55d > #5 [ffff9965007e3dd8] do_filp_open at ffffffff9b100a33 > #6 [ffff9965007e3ee0] do_sys_open at ffffffff9b0eb2d6 > #7 [ffff9965007e3f38] do_syscall_64 at ffffffff9ae04315 > * (I think legitimize_path is bogus) > > in path_openat > } else { > const char *s = path_init(nd, flags); > while (!(error = link_path_walk(s, nd)) && > (error = do_last(nd, file, op)) > 0) { <<<< > > do_last: > if (open_flag & O_CREAT) > inode_lock(dir->d_inode); <<<< > else > so it's trying to take inode->i_rwsem for the directory > > DENTRY INODE SUPERBLK TYPE PATH > ffff8c68bb8e79c0 ffff8c691158ef20 ffff8c6915bf9000 DIR /mnt/vm1_smb/ > inode.i_rwsem is ffff8c691158efc0 > > <struct rw_semaphore 0xffff8c691158efc0>: > owner: <struct task_struct 0xffff8c6914275d00> (UN - 8856 - > reopen_file), counter: 0x0000000000000003 > waitlist: 2 > 0xffff9965007e3c90 8863 reopen_file UN 0 1:29:22.926 > RWSEM_WAITING_FOR_WRITE > 0xffff996500393e00 9802 ls UN 0 1:17:26.700 > RWSEM_WAITING_FOR_READ > > > the owner of the inode.i_rwsem of the directory is: > > [0 00:00:00.109] [UN] PID: 8856 TASK: ffff8c6914275d00 CPU: 3 > COMMAND: "reopen_file" > #0 [ffff99650065b828] __schedule at ffffffff9b6e6095 > #1 [ffff99650065b8b8] schedule at ffffffff9b6e64df > #2 [ffff99650065b8c8] schedule_timeout at ffffffff9b6e9f89 > #3 [ffff99650065b940] msleep at ffffffff9af573a9 > #4 [ffff99650065b948] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs] > #5 [ffff99650065ba38] cifs_writepage_locked at ffffffffc0a0b8f3 [cifs] > #6 [ffff99650065bab0] cifs_launder_page at ffffffffc0a0bb72 [cifs] > #7 [ffff99650065bb30] invalidate_inode_pages2_range at ffffffff9b04d4bd > #8 [ffff99650065bcb8] cifs_invalidate_mapping at ffffffffc0a11339 [cifs] > #9 [ffff99650065bcd0] cifs_revalidate_mapping at ffffffffc0a1139a [cifs] > #10 [ffff99650065bcf0] cifs_d_revalidate at ffffffffc0a014f6 [cifs] > #11 [ffff99650065bd08] path_openat at ffffffff9b0fe7f7 > #12 [ffff99650065bdd8] do_filp_open at ffffffff9b100a33 > #13 [ffff99650065bee0] do_sys_open at ffffffff9b0eb2d6 > #14 [ffff99650065bf38] do_syscall_64 at ffffffff9ae04315 > > cifs_launder_page is for page 0xffffd1e2c07d2480 > > crash> page.index,mapping,flags 0xffffd1e2c07d2480 > index = 0x8 > mapping = 0xffff8c68f3cd0db0 > flags = 0xfffffc0008095 > > PAGE-FLAG BIT VALUE > PG_locked 0 0000001 > PG_uptodate 2 0000004 > PG_lru 4 0000010 > PG_waiters 7 0000080 > PG_writeback 15 0008000 > > > inode is ffff8c68f3cd0c40 > inode.i_rwsem is ffff8c68f3cd0ce0 > DENTRY INODE SUPERBLK TYPE PATH > ffff8c68a1f1b480 ffff8c68f3cd0c40 ffff8c6915bf9000 REG > /mnt/vm1_smb/testfile.8853 > > > this process holds the inode->i_rwsem for the parent directory, is > laundering a page attached to the inode of the file it's opening, and in > _cifsFileInfo_put is trying to down_write the cifsInodeInflock_sem > for the file itself. > > > <struct rw_semaphore 0xffff8c68f3cd0ce0>: > owner: <struct task_struct 0xffff8c6914272e80> (UN - 8854 - > reopen_file), counter: 0x0000000000000003 > waitlist: 1 > 0xffff9965005dfd80 8855 reopen_file UN 0 1:29:22.912 > RWSEM_WAITING_FOR_WRITE > > this is the inode.i_rwsem for the file > > the owner: > > [0 00:48:22.739] [UN] PID: 8854 TASK: ffff8c6914272e80 CPU: 2 > COMMAND: "reopen_file" > #0 [ffff99650054fb38] __schedule at ffffffff9b6e6095 > #1 [ffff99650054fbc8] schedule at ffffffff9b6e64df > #2 [ffff99650054fbd8] io_schedule at ffffffff9b6e68e2 > #3 [ffff99650054fbe8] __lock_page at ffffffff9b03c56f > #4 [ffff99650054fc80] pagecache_get_page at ffffffff9b03dcdf > #5 [ffff99650054fcc0] grab_cache_page_write_begin at ffffffff9b03ef4c > #6 [ffff99650054fcd0] cifs_write_begin at ffffffffc0a064ec [cifs] > #7 [ffff99650054fd30] generic_perform_write at ffffffff9b03bba4 > #8 [ffff99650054fda8] __generic_file_write_iter at ffffffff9b04060a > #9 [ffff99650054fdf0] cifs_strict_writev.cold.70 at ffffffffc0a4469b [cifs] > #10 [ffff99650054fe48] new_sync_write at ffffffff9b0ec1dd > #11 [ffff99650054fed0] vfs_write at ffffffff9b0eed35 > #12 [ffff99650054ff00] ksys_write at ffffffff9b0eefd9 > #13 [ffff99650054ff38] do_syscall_64 at ffffffff9ae04315 > > the process holds the inode->i_rwsem for the file to which it's writing, > and is trying to __lock_page for the same page as in the other processes > > > the other tasks: > [0 00:00:00.028] [UN] PID: 8859 TASK: ffff8c6915479740 CPU: 2 > COMMAND: "reopen_file" > #0 [ffff9965007b39d8] __schedule at ffffffff9b6e6095 > #1 [ffff9965007b3a68] schedule at ffffffff9b6e64df > #2 [ffff9965007b3a78] schedule_timeout at ffffffff9b6e9f89 > #3 [ffff9965007b3af0] msleep at ffffffff9af573a9 > #4 [ffff9965007b3af8] cifs_new_fileinfo.cold.61 at ffffffffc0a42a07 [cifs] > #5 [ffff9965007b3b78] cifs_open at ffffffffc0a0709d [cifs] > #6 [ffff9965007b3cd8] do_dentry_open at ffffffff9b0e9b7a > #7 [ffff9965007b3d08] path_openat at ffffffff9b0fe34f > #8 [ffff9965007b3dd8] do_filp_open at ffffffff9b100a33 > #9 [ffff9965007b3ee0] do_sys_open at ffffffff9b0eb2d6 > #10 [ffff9965007b3f38] do_syscall_64 at ffffffff9ae04315 > > this is opening the file, and is trying to down_write cinode->lock_sem > > > [0 00:00:00.041] [UN] PID: 8860 TASK: ffff8c691547ae80 CPU: 2 > COMMAND: "reopen_file" > [0 00:00:00.057] [UN] PID: 8861 TASK: ffff8c6915478000 CPU: 3 > COMMAND: "reopen_file" > [0 00:00:00.059] [UN] PID: 8858 TASK: ffff8c6914271740 CPU: 2 > COMMAND: "reopen_file" > [0 00:00:00.109] [UN] PID: 8862 TASK: ffff8c691547dd00 CPU: 6 > COMMAND: "reopen_file" > #0 [ffff9965007c3c78] __schedule at ffffffff9b6e6095 > #1 [ffff9965007c3d08] schedule at ffffffff9b6e64df > #2 [ffff9965007c3d18] schedule_timeout at ffffffff9b6e9f89 > #3 [ffff9965007c3d90] msleep at ffffffff9af573a9 > #4 [ffff9965007c3d98] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs] > #5 [ffff9965007c3e88] cifs_close at ffffffffc0a07aaf [cifs] > #6 [ffff9965007c3ea0] __fput at ffffffff9b0efa6e > #7 [ffff9965007c3ee8] task_work_run at ffffffff9aef1614 > #8 [ffff9965007c3f20] exit_to_usermode_loop at ffffffff9ae03d6f > #9 [ffff9965007c3f38] do_syscall_64 at ffffffff9ae0444c > > closing the file, and trying to down_write cifsi->lock_sem > > > [0 00:48:22.839] [UN] PID: 8857 TASK: ffff8c6914270000 CPU: 7 > COMMAND: "reopen_file" > #0 [ffff9965006a7cc8] __schedule at ffffffff9b6e6095 > #1 [ffff9965006a7d58] schedule at ffffffff9b6e64df > #2 [ffff9965006a7d68] io_schedule at ffffffff9b6e68e2 > #3 [ffff9965006a7d78] wait_on_page_bit at ffffffff9b03cac6 > #4 [ffff9965006a7e10] __filemap_fdatawait_range at ffffffff9b03b028 > #5 [ffff9965006a7ed8] filemap_write_and_wait at ffffffff9b040165 > #6 [ffff9965006a7ef0] cifs_flush at ffffffffc0a0c2fa [cifs] > #7 [ffff9965006a7f10] filp_close at ffffffff9b0e93f1 > #8 [ffff9965006a7f30] __x64_sys_close at ffffffff9b0e9a0e > #9 [ffff9965006a7f38] do_syscall_64 at ffffffff9ae04315 > > in __filemap_fdatawait_range > wait_on_page_writeback(page); > for the same page of the file > > > > [0 00:48:22.718] [UN] PID: 8855 TASK: ffff8c69142745c0 CPU: 7 > COMMAND: "reopen_file" > #0 [ffff9965005dfc98] __schedule at ffffffff9b6e6095 > #1 [ffff9965005dfd28] schedule at ffffffff9b6e64df > #2 [ffff9965005dfd38] rwsem_down_write_slowpath at ffffffff9af283d7 > #3 [ffff9965005dfdf0] cifs_strict_writev at ffffffffc0a0c40a [cifs] > #4 [ffff9965005dfe48] new_sync_write at ffffffff9b0ec1dd > #5 [ffff9965005dfed0] vfs_write at ffffffff9b0eed35 > #6 [ffff9965005dff00] ksys_write at ffffffff9b0eefd9 > #7 [ffff9965005dff38] do_syscall_64 at ffffffff9ae04315 > > inode_lock(inode); > > > and one 'ls' later on, to see whether the rest of the mount is available > (the test file is in the root, so we get blocked up on the directory > ->i_rwsem), so the entire mount is unavailable > > [0 00:36:26.473] [UN] PID: 9802 TASK: ffff8c691436ae80 CPU: 4 > COMMAND: "ls" > #0 [ffff996500393d28] __schedule at ffffffff9b6e6095 > #1 [ffff996500393db8] schedule at ffffffff9b6e64df > #2 [ffff996500393dc8] rwsem_down_read_slowpath at ffffffff9b6e9421 > #3 [ffff996500393e78] down_read_killable at ffffffff9b6e95e2 > #4 [ffff996500393e88] iterate_dir at ffffffff9b103c56 > #5 [ffff996500393ec8] ksys_getdents64 at ffffffff9b104b0c > #6 [ffff996500393f30] __x64_sys_getdents64 at ffffffff9b104bb6 > #7 [ffff996500393f38] do_syscall_64 at ffffffff9ae04315 > > in iterate_dir: > if (shared) > res = down_read_killable(&inode->i_rwsem); <<<< > else > res = down_write_killable(&inode->i_rwsem); > Reported-by: Frank Sorenson <sorenson@redhat.com> Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com> Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2019-11-03 03:06:37 +00:00
fileinfo_put_wq = alloc_workqueue("cifsfileinfoput",
WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
if (!fileinfo_put_wq) {
rc = -ENOMEM;
goto out_destroy_decrypt_wq;
}
CIFS: fix oplock break deadlocks When the final cifsFileInfo_put() is called from cifsiod and an oplock break work is queued, lockdep complains loudly: ============================================= [ INFO: possible recursive locking detected ] 4.11.0+ #21 Not tainted --------------------------------------------- kworker/0:2/78 is trying to acquire lock: ("cifsiod"){++++.+}, at: flush_work+0x215/0x350 but task is already holding lock: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock("cifsiod"); lock("cifsiod"); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by kworker/0:2/78: #0: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 #1: ((&wdata->work)){+.+...}, at: process_one_work+0x255/0x8e0 stack backtrace: CPU: 0 PID: 78 Comm: kworker/0:2 Not tainted 4.11.0+ #21 Workqueue: cifsiod cifs_writev_complete Call Trace: dump_stack+0x85/0xc2 __lock_acquire+0x17dd/0x2260 ? match_held_lock+0x20/0x2b0 ? trace_hardirqs_off_caller+0x86/0x130 ? mark_lock+0xa6/0x920 lock_acquire+0xcc/0x260 ? lock_acquire+0xcc/0x260 ? flush_work+0x215/0x350 flush_work+0x236/0x350 ? flush_work+0x215/0x350 ? destroy_worker+0x170/0x170 __cancel_work_timer+0x17d/0x210 ? ___preempt_schedule+0x16/0x18 cancel_work_sync+0x10/0x20 cifsFileInfo_put+0x338/0x7f0 cifs_writedata_release+0x2a/0x40 ? cifs_writedata_release+0x2a/0x40 cifs_writev_complete+0x29d/0x850 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 This is a real warning. Since the oplock is queued on the same workqueue this can deadlock if there is only one worker thread active for the workqueue (which will be the case during memory pressure when the rescuer thread is handling it). Furthermore, there is at least one other kind of hang possible due to the oplock break handling if there is only worker. (This can be reproduced without introducing memory pressure by having passing 1 for the max_active parameter of cifsiod.) cifs_oplock_break() can wait indefintely in the filemap_fdatawait() while the cifs_writev_complete() work is blocked: sysrq: SysRq : Show Blocked State task PC stack pid father kworker/0:1 D 0 16 2 0x00000000 Workqueue: cifsiod cifs_oplock_break Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x4a/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 cifs_oplock_break+0x651/0x710 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 dd D 0 683 171 0x00000000 Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x29/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 filemap_write_and_wait+0x4e/0x70 cifs_flush+0x6a/0xb0 filp_close+0x52/0xa0 __close_fd+0xdc/0x150 SyS_close+0x33/0x60 entry_SYSCALL_64_fastpath+0x1f/0xbe Showing all locks held in the system: 2 locks held by kworker/0:1/16: #0: ("cifsiod"){.+.+.+}, at: process_one_work+0x255/0x8e0 #1: ((&cfile->oplock_break)){+.+.+.}, at: process_one_work+0x255/0x8e0 Showing busy workqueues and worker pools: workqueue cifsiod: flags=0xc pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/1 in-flight: 16:cifs_oplock_break delayed: cifs_writev_complete, cifs_echo_request pool 0: cpus=0 node=0 flags=0x0 nice=0 hung=0s workers=3 idle: 750 3 Fix these problems by creating a a new workqueue (with a rescuer) for the oplock break work. Signed-off-by: Rabin Vincent <rabinv@axis.com> Signed-off-by: Steve French <smfrench@gmail.com> CC: Stable <stable@vger.kernel.org>
2017-05-03 15:54:01 +00:00
cifsoplockd_wq = alloc_workqueue("cifsoplockd",
WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
if (!cifsoplockd_wq) {
rc = -ENOMEM;
cifs: move cifsFileInfo_put logic into a work-queue This patch moves the final part of the cifsFileInfo_put() logic where we need a write lock on lock_sem to be processed in a separate thread that holds no other locks. This is to prevent deadlocks like the one below: > there are 6 processes looping to while trying to down_write > cinode->lock_sem, 5 of them from _cifsFileInfo_put, and one from > cifs_new_fileinfo > > and there are 5 other processes which are blocked, several of them > waiting on either PG_writeback or PG_locked (which are both set), all > for the same page of the file > > 2 inode_lock() (inode->i_rwsem) for the file > 1 wait_on_page_writeback() for the page > 1 down_read(inode->i_rwsem) for the inode of the directory > 1 inode_lock()(inode->i_rwsem) for the inode of the directory > 1 __lock_page > > > so processes are blocked waiting on: > page flags PG_locked and PG_writeback for one specific page > inode->i_rwsem for the directory > inode->i_rwsem for the file > cifsInodeInflock_sem > > > > here are the more gory details (let me know if I need to provide > anything more/better): > > [0 00:48:22.765] [UN] PID: 8863 TASK: ffff8c691547c5c0 CPU: 3 > COMMAND: "reopen_file" > #0 [ffff9965007e3ba8] __schedule at ffffffff9b6e6095 > #1 [ffff9965007e3c38] schedule at ffffffff9b6e64df > #2 [ffff9965007e3c48] rwsem_down_write_slowpath at ffffffff9af283d7 > #3 [ffff9965007e3cb8] legitimize_path at ffffffff9b0f975d > #4 [ffff9965007e3d08] path_openat at ffffffff9b0fe55d > #5 [ffff9965007e3dd8] do_filp_open at ffffffff9b100a33 > #6 [ffff9965007e3ee0] do_sys_open at ffffffff9b0eb2d6 > #7 [ffff9965007e3f38] do_syscall_64 at ffffffff9ae04315 > * (I think legitimize_path is bogus) > > in path_openat > } else { > const char *s = path_init(nd, flags); > while (!(error = link_path_walk(s, nd)) && > (error = do_last(nd, file, op)) > 0) { <<<< > > do_last: > if (open_flag & O_CREAT) > inode_lock(dir->d_inode); <<<< > else > so it's trying to take inode->i_rwsem for the directory > > DENTRY INODE SUPERBLK TYPE PATH > ffff8c68bb8e79c0 ffff8c691158ef20 ffff8c6915bf9000 DIR /mnt/vm1_smb/ > inode.i_rwsem is ffff8c691158efc0 > > <struct rw_semaphore 0xffff8c691158efc0>: > owner: <struct task_struct 0xffff8c6914275d00> (UN - 8856 - > reopen_file), counter: 0x0000000000000003 > waitlist: 2 > 0xffff9965007e3c90 8863 reopen_file UN 0 1:29:22.926 > RWSEM_WAITING_FOR_WRITE > 0xffff996500393e00 9802 ls UN 0 1:17:26.700 > RWSEM_WAITING_FOR_READ > > > the owner of the inode.i_rwsem of the directory is: > > [0 00:00:00.109] [UN] PID: 8856 TASK: ffff8c6914275d00 CPU: 3 > COMMAND: "reopen_file" > #0 [ffff99650065b828] __schedule at ffffffff9b6e6095 > #1 [ffff99650065b8b8] schedule at ffffffff9b6e64df > #2 [ffff99650065b8c8] schedule_timeout at ffffffff9b6e9f89 > #3 [ffff99650065b940] msleep at ffffffff9af573a9 > #4 [ffff99650065b948] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs] > #5 [ffff99650065ba38] cifs_writepage_locked at ffffffffc0a0b8f3 [cifs] > #6 [ffff99650065bab0] cifs_launder_page at ffffffffc0a0bb72 [cifs] > #7 [ffff99650065bb30] invalidate_inode_pages2_range at ffffffff9b04d4bd > #8 [ffff99650065bcb8] cifs_invalidate_mapping at ffffffffc0a11339 [cifs] > #9 [ffff99650065bcd0] cifs_revalidate_mapping at ffffffffc0a1139a [cifs] > #10 [ffff99650065bcf0] cifs_d_revalidate at ffffffffc0a014f6 [cifs] > #11 [ffff99650065bd08] path_openat at ffffffff9b0fe7f7 > #12 [ffff99650065bdd8] do_filp_open at ffffffff9b100a33 > #13 [ffff99650065bee0] do_sys_open at ffffffff9b0eb2d6 > #14 [ffff99650065bf38] do_syscall_64 at ffffffff9ae04315 > > cifs_launder_page is for page 0xffffd1e2c07d2480 > > crash> page.index,mapping,flags 0xffffd1e2c07d2480 > index = 0x8 > mapping = 0xffff8c68f3cd0db0 > flags = 0xfffffc0008095 > > PAGE-FLAG BIT VALUE > PG_locked 0 0000001 > PG_uptodate 2 0000004 > PG_lru 4 0000010 > PG_waiters 7 0000080 > PG_writeback 15 0008000 > > > inode is ffff8c68f3cd0c40 > inode.i_rwsem is ffff8c68f3cd0ce0 > DENTRY INODE SUPERBLK TYPE PATH > ffff8c68a1f1b480 ffff8c68f3cd0c40 ffff8c6915bf9000 REG > /mnt/vm1_smb/testfile.8853 > > > this process holds the inode->i_rwsem for the parent directory, is > laundering a page attached to the inode of the file it's opening, and in > _cifsFileInfo_put is trying to down_write the cifsInodeInflock_sem > for the file itself. > > > <struct rw_semaphore 0xffff8c68f3cd0ce0>: > owner: <struct task_struct 0xffff8c6914272e80> (UN - 8854 - > reopen_file), counter: 0x0000000000000003 > waitlist: 1 > 0xffff9965005dfd80 8855 reopen_file UN 0 1:29:22.912 > RWSEM_WAITING_FOR_WRITE > > this is the inode.i_rwsem for the file > > the owner: > > [0 00:48:22.739] [UN] PID: 8854 TASK: ffff8c6914272e80 CPU: 2 > COMMAND: "reopen_file" > #0 [ffff99650054fb38] __schedule at ffffffff9b6e6095 > #1 [ffff99650054fbc8] schedule at ffffffff9b6e64df > #2 [ffff99650054fbd8] io_schedule at ffffffff9b6e68e2 > #3 [ffff99650054fbe8] __lock_page at ffffffff9b03c56f > #4 [ffff99650054fc80] pagecache_get_page at ffffffff9b03dcdf > #5 [ffff99650054fcc0] grab_cache_page_write_begin at ffffffff9b03ef4c > #6 [ffff99650054fcd0] cifs_write_begin at ffffffffc0a064ec [cifs] > #7 [ffff99650054fd30] generic_perform_write at ffffffff9b03bba4 > #8 [ffff99650054fda8] __generic_file_write_iter at ffffffff9b04060a > #9 [ffff99650054fdf0] cifs_strict_writev.cold.70 at ffffffffc0a4469b [cifs] > #10 [ffff99650054fe48] new_sync_write at ffffffff9b0ec1dd > #11 [ffff99650054fed0] vfs_write at ffffffff9b0eed35 > #12 [ffff99650054ff00] ksys_write at ffffffff9b0eefd9 > #13 [ffff99650054ff38] do_syscall_64 at ffffffff9ae04315 > > the process holds the inode->i_rwsem for the file to which it's writing, > and is trying to __lock_page for the same page as in the other processes > > > the other tasks: > [0 00:00:00.028] [UN] PID: 8859 TASK: ffff8c6915479740 CPU: 2 > COMMAND: "reopen_file" > #0 [ffff9965007b39d8] __schedule at ffffffff9b6e6095 > #1 [ffff9965007b3a68] schedule at ffffffff9b6e64df > #2 [ffff9965007b3a78] schedule_timeout at ffffffff9b6e9f89 > #3 [ffff9965007b3af0] msleep at ffffffff9af573a9 > #4 [ffff9965007b3af8] cifs_new_fileinfo.cold.61 at ffffffffc0a42a07 [cifs] > #5 [ffff9965007b3b78] cifs_open at ffffffffc0a0709d [cifs] > #6 [ffff9965007b3cd8] do_dentry_open at ffffffff9b0e9b7a > #7 [ffff9965007b3d08] path_openat at ffffffff9b0fe34f > #8 [ffff9965007b3dd8] do_filp_open at ffffffff9b100a33 > #9 [ffff9965007b3ee0] do_sys_open at ffffffff9b0eb2d6 > #10 [ffff9965007b3f38] do_syscall_64 at ffffffff9ae04315 > > this is opening the file, and is trying to down_write cinode->lock_sem > > > [0 00:00:00.041] [UN] PID: 8860 TASK: ffff8c691547ae80 CPU: 2 > COMMAND: "reopen_file" > [0 00:00:00.057] [UN] PID: 8861 TASK: ffff8c6915478000 CPU: 3 > COMMAND: "reopen_file" > [0 00:00:00.059] [UN] PID: 8858 TASK: ffff8c6914271740 CPU: 2 > COMMAND: "reopen_file" > [0 00:00:00.109] [UN] PID: 8862 TASK: ffff8c691547dd00 CPU: 6 > COMMAND: "reopen_file" > #0 [ffff9965007c3c78] __schedule at ffffffff9b6e6095 > #1 [ffff9965007c3d08] schedule at ffffffff9b6e64df > #2 [ffff9965007c3d18] schedule_timeout at ffffffff9b6e9f89 > #3 [ffff9965007c3d90] msleep at ffffffff9af573a9 > #4 [ffff9965007c3d98] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs] > #5 [ffff9965007c3e88] cifs_close at ffffffffc0a07aaf [cifs] > #6 [ffff9965007c3ea0] __fput at ffffffff9b0efa6e > #7 [ffff9965007c3ee8] task_work_run at ffffffff9aef1614 > #8 [ffff9965007c3f20] exit_to_usermode_loop at ffffffff9ae03d6f > #9 [ffff9965007c3f38] do_syscall_64 at ffffffff9ae0444c > > closing the file, and trying to down_write cifsi->lock_sem > > > [0 00:48:22.839] [UN] PID: 8857 TASK: ffff8c6914270000 CPU: 7 > COMMAND: "reopen_file" > #0 [ffff9965006a7cc8] __schedule at ffffffff9b6e6095 > #1 [ffff9965006a7d58] schedule at ffffffff9b6e64df > #2 [ffff9965006a7d68] io_schedule at ffffffff9b6e68e2 > #3 [ffff9965006a7d78] wait_on_page_bit at ffffffff9b03cac6 > #4 [ffff9965006a7e10] __filemap_fdatawait_range at ffffffff9b03b028 > #5 [ffff9965006a7ed8] filemap_write_and_wait at ffffffff9b040165 > #6 [ffff9965006a7ef0] cifs_flush at ffffffffc0a0c2fa [cifs] > #7 [ffff9965006a7f10] filp_close at ffffffff9b0e93f1 > #8 [ffff9965006a7f30] __x64_sys_close at ffffffff9b0e9a0e > #9 [ffff9965006a7f38] do_syscall_64 at ffffffff9ae04315 > > in __filemap_fdatawait_range > wait_on_page_writeback(page); > for the same page of the file > > > > [0 00:48:22.718] [UN] PID: 8855 TASK: ffff8c69142745c0 CPU: 7 > COMMAND: "reopen_file" > #0 [ffff9965005dfc98] __schedule at ffffffff9b6e6095 > #1 [ffff9965005dfd28] schedule at ffffffff9b6e64df > #2 [ffff9965005dfd38] rwsem_down_write_slowpath at ffffffff9af283d7 > #3 [ffff9965005dfdf0] cifs_strict_writev at ffffffffc0a0c40a [cifs] > #4 [ffff9965005dfe48] new_sync_write at ffffffff9b0ec1dd > #5 [ffff9965005dfed0] vfs_write at ffffffff9b0eed35 > #6 [ffff9965005dff00] ksys_write at ffffffff9b0eefd9 > #7 [ffff9965005dff38] do_syscall_64 at ffffffff9ae04315 > > inode_lock(inode); > > > and one 'ls' later on, to see whether the rest of the mount is available > (the test file is in the root, so we get blocked up on the directory > ->i_rwsem), so the entire mount is unavailable > > [0 00:36:26.473] [UN] PID: 9802 TASK: ffff8c691436ae80 CPU: 4 > COMMAND: "ls" > #0 [ffff996500393d28] __schedule at ffffffff9b6e6095 > #1 [ffff996500393db8] schedule at ffffffff9b6e64df > #2 [ffff996500393dc8] rwsem_down_read_slowpath at ffffffff9b6e9421 > #3 [ffff996500393e78] down_read_killable at ffffffff9b6e95e2 > #4 [ffff996500393e88] iterate_dir at ffffffff9b103c56 > #5 [ffff996500393ec8] ksys_getdents64 at ffffffff9b104b0c > #6 [ffff996500393f30] __x64_sys_getdents64 at ffffffff9b104bb6 > #7 [ffff996500393f38] do_syscall_64 at ffffffff9ae04315 > > in iterate_dir: > if (shared) > res = down_read_killable(&inode->i_rwsem); <<<< > else > res = down_write_killable(&inode->i_rwsem); > Reported-by: Frank Sorenson <sorenson@redhat.com> Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com> Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2019-11-03 03:06:37 +00:00
goto out_destroy_fileinfo_put_wq;
CIFS: fix oplock break deadlocks When the final cifsFileInfo_put() is called from cifsiod and an oplock break work is queued, lockdep complains loudly: ============================================= [ INFO: possible recursive locking detected ] 4.11.0+ #21 Not tainted --------------------------------------------- kworker/0:2/78 is trying to acquire lock: ("cifsiod"){++++.+}, at: flush_work+0x215/0x350 but task is already holding lock: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock("cifsiod"); lock("cifsiod"); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by kworker/0:2/78: #0: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 #1: ((&wdata->work)){+.+...}, at: process_one_work+0x255/0x8e0 stack backtrace: CPU: 0 PID: 78 Comm: kworker/0:2 Not tainted 4.11.0+ #21 Workqueue: cifsiod cifs_writev_complete Call Trace: dump_stack+0x85/0xc2 __lock_acquire+0x17dd/0x2260 ? match_held_lock+0x20/0x2b0 ? trace_hardirqs_off_caller+0x86/0x130 ? mark_lock+0xa6/0x920 lock_acquire+0xcc/0x260 ? lock_acquire+0xcc/0x260 ? flush_work+0x215/0x350 flush_work+0x236/0x350 ? flush_work+0x215/0x350 ? destroy_worker+0x170/0x170 __cancel_work_timer+0x17d/0x210 ? ___preempt_schedule+0x16/0x18 cancel_work_sync+0x10/0x20 cifsFileInfo_put+0x338/0x7f0 cifs_writedata_release+0x2a/0x40 ? cifs_writedata_release+0x2a/0x40 cifs_writev_complete+0x29d/0x850 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 This is a real warning. Since the oplock is queued on the same workqueue this can deadlock if there is only one worker thread active for the workqueue (which will be the case during memory pressure when the rescuer thread is handling it). Furthermore, there is at least one other kind of hang possible due to the oplock break handling if there is only worker. (This can be reproduced without introducing memory pressure by having passing 1 for the max_active parameter of cifsiod.) cifs_oplock_break() can wait indefintely in the filemap_fdatawait() while the cifs_writev_complete() work is blocked: sysrq: SysRq : Show Blocked State task PC stack pid father kworker/0:1 D 0 16 2 0x00000000 Workqueue: cifsiod cifs_oplock_break Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x4a/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 cifs_oplock_break+0x651/0x710 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 dd D 0 683 171 0x00000000 Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x29/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 filemap_write_and_wait+0x4e/0x70 cifs_flush+0x6a/0xb0 filp_close+0x52/0xa0 __close_fd+0xdc/0x150 SyS_close+0x33/0x60 entry_SYSCALL_64_fastpath+0x1f/0xbe Showing all locks held in the system: 2 locks held by kworker/0:1/16: #0: ("cifsiod"){.+.+.+}, at: process_one_work+0x255/0x8e0 #1: ((&cfile->oplock_break)){+.+.+.}, at: process_one_work+0x255/0x8e0 Showing busy workqueues and worker pools: workqueue cifsiod: flags=0xc pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/1 in-flight: 16:cifs_oplock_break delayed: cifs_writev_complete, cifs_echo_request pool 0: cpus=0 node=0 flags=0x0 nice=0 hung=0s workers=3 idle: 750 3 Fix these problems by creating a a new workqueue (with a rescuer) for the oplock break work. Signed-off-by: Rabin Vincent <rabinv@axis.com> Signed-off-by: Steve French <smfrench@gmail.com> CC: Stable <stable@vger.kernel.org>
2017-05-03 15:54:01 +00:00
}
deferredclose_wq = alloc_workqueue("deferredclose",
WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
if (!deferredclose_wq) {
rc = -ENOMEM;
goto out_destroy_cifsoplockd_wq;
}
rc = cifs_init_inodecache();
if (rc)
cifs: Support fscache indexing rewrite Change the cifs filesystem to take account of the changes to fscache's indexing rewrite and reenable caching in cifs. The following changes have been made: (1) The fscache_netfs struct is no more, and there's no need to register the filesystem as a whole. (2) The session cookie is now an fscache_volume cookie, allocated with fscache_acquire_volume(). That takes three parameters: a string representing the "volume" in the index, a string naming the cache to use (or NULL) and a u64 that conveys coherency metadata for the volume. For cifs, I've made it render the volume name string as: "cifs,<ipaddress>,<sharename>" where the sharename has '/' characters replaced with ';'. This probably needs rethinking a bit as the total name could exceed the maximum filename component length. Further, the coherency data is currently just set to 0. It needs something else doing with it - I wonder if it would suffice simply to sum the resource_id, vol_create_time and vol_serial_number or maybe hash them. (3) The fscache_cookie_def is no more and needed information is passed directly to fscache_acquire_cookie(). The cache no longer calls back into the filesystem, but rather metadata changes are indicated at other times. fscache_acquire_cookie() is passed the same keying and coherency information as before. (4) The functions to set/reset cookies are removed and fscache_use_cookie() and fscache_unuse_cookie() are used instead. fscache_use_cookie() is passed a flag to indicate if the cookie is opened for writing. fscache_unuse_cookie() is passed updates for the metadata if we changed it (ie. if the file was opened for writing). These are called when the file is opened or closed. (5) cifs_setattr_*() are made to call fscache_resize() to change the size of the cache object. (6) The functions to read and write data are stubbed out pending a conversion to use netfslib. Changes ======= ver #8: - Abstract cache invalidation into a helper function. - Fix some checkpatch warnings[3]. ver #7: - Removed the accidentally added-back call to get the super cookie in cifs_root_iget(). - Fixed the right call to cifs_fscache_get_super_cookie() to take account of the "-o fsc" mount flag. ver #6: - Moved the change of gfpflags_allow_blocking() to current_is_kswapd() for cifs here. - Fixed one of the error paths in cifs_atomic_open() to jump around the call to use the cookie. - Fixed an additional successful return in the middle of cifs_open() to use the cookie on the way out. - Only get a volume cookie (and thus inode cookies) when "-o fsc" is supplied to mount. ver #5: - Fixed a couple of bits of cookie handling[2]: - The cookie should be released in cifs_evict_inode(), not cifsFileInfo_put_final(). The cookie needs to persist beyond file closure so that writepages will be able to write to it. - fscache_use_cookie() needs to be called in cifs_atomic_open() as it is for cifs_open(). ver #4: - Fixed the use of sizeof with memset. - tcon->vol_create_time is __le64 so doesn't need cpu_to_le64(). ver #3: - Canonicalise the cifs coherency data to make the cache portable. - Set volume coherency data. ver #2: - Use gfpflags_allow_blocking() rather than using flag directly. - Upgraded to -rc4 to allow for upstream changes[1]. - fscache_acquire_volume() now returns errors. Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: Jeff Layton <jlayton@kernel.org> cc: Steve French <smfrench@gmail.com> cc: Shyam Prasad N <nspmangalore@gmail.com> cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=23b55d673d7527b093cd97b7c217c82e70cd1af0 [1] Link: https://lore.kernel.org/r/3419813.1641592362@warthog.procyon.org.uk/ [2] Link: https://lore.kernel.org/r/CAH2r5muTanw9pJqzAHd01d9A8keeChkzGsCEH6=0rHutVLAF-A@mail.gmail.com/ [3] Link: https://lore.kernel.org/r/163819671009.215744.11230627184193298714.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906982979.143852.10672081929614953210.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967187187.1823006.247415138444991444.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021579335.640689.2681324337038770579.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/3462849.1641593783@warthog.procyon.org.uk/ # v5 Link: https://lore.kernel.org/r/1318953.1642024578@warthog.procyon.org.uk/ # v6 Signed-off-by: Steve French <stfrench@microsoft.com>
2020-11-17 15:56:59 +00:00
goto out_destroy_deferredclose_wq;
rc = init_mids();
if (rc)
goto out_destroy_inodecache;
rc = cifs_init_request_bufs();
if (rc)
goto out_destroy_mids;
#ifdef CONFIG_CIFS_DFS_UPCALL
rc = dfs_cache_init();
if (rc)
goto out_destroy_request_bufs;
#endif /* CONFIG_CIFS_DFS_UPCALL */
#ifdef CONFIG_CIFS_UPCALL
rc = init_cifs_spnego();
if (rc)
goto out_destroy_dfs_cache;
#endif /* CONFIG_CIFS_UPCALL */
#ifdef CONFIG_CIFS_SWN_UPCALL
rc = cifs_genl_init();
if (rc)
goto out_register_key_type;
#endif /* CONFIG_CIFS_SWN_UPCALL */
rc = init_cifs_idmap();
if (rc)
goto out_cifs_swn_init;
rc = register_filesystem(&cifs_fs_type);
if (rc)
goto out_init_cifs_idmap;
rc = register_filesystem(&smb3_fs_type);
if (rc) {
unregister_filesystem(&cifs_fs_type);
goto out_init_cifs_idmap;
}
return 0;
out_init_cifs_idmap:
exit_cifs_idmap();
out_cifs_swn_init:
#ifdef CONFIG_CIFS_SWN_UPCALL
cifs_genl_exit();
out_register_key_type:
#endif
#ifdef CONFIG_CIFS_UPCALL
exit_cifs_spnego();
out_destroy_dfs_cache:
#endif
#ifdef CONFIG_CIFS_DFS_UPCALL
dfs_cache_destroy();
out_destroy_request_bufs:
#endif
cifs_destroy_request_bufs();
out_destroy_mids:
destroy_mids();
out_destroy_inodecache:
cifs_destroy_inodecache();
out_destroy_deferredclose_wq:
destroy_workqueue(deferredclose_wq);
CIFS: fix oplock break deadlocks When the final cifsFileInfo_put() is called from cifsiod and an oplock break work is queued, lockdep complains loudly: ============================================= [ INFO: possible recursive locking detected ] 4.11.0+ #21 Not tainted --------------------------------------------- kworker/0:2/78 is trying to acquire lock: ("cifsiod"){++++.+}, at: flush_work+0x215/0x350 but task is already holding lock: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock("cifsiod"); lock("cifsiod"); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by kworker/0:2/78: #0: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 #1: ((&wdata->work)){+.+...}, at: process_one_work+0x255/0x8e0 stack backtrace: CPU: 0 PID: 78 Comm: kworker/0:2 Not tainted 4.11.0+ #21 Workqueue: cifsiod cifs_writev_complete Call Trace: dump_stack+0x85/0xc2 __lock_acquire+0x17dd/0x2260 ? match_held_lock+0x20/0x2b0 ? trace_hardirqs_off_caller+0x86/0x130 ? mark_lock+0xa6/0x920 lock_acquire+0xcc/0x260 ? lock_acquire+0xcc/0x260 ? flush_work+0x215/0x350 flush_work+0x236/0x350 ? flush_work+0x215/0x350 ? destroy_worker+0x170/0x170 __cancel_work_timer+0x17d/0x210 ? ___preempt_schedule+0x16/0x18 cancel_work_sync+0x10/0x20 cifsFileInfo_put+0x338/0x7f0 cifs_writedata_release+0x2a/0x40 ? cifs_writedata_release+0x2a/0x40 cifs_writev_complete+0x29d/0x850 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 This is a real warning. Since the oplock is queued on the same workqueue this can deadlock if there is only one worker thread active for the workqueue (which will be the case during memory pressure when the rescuer thread is handling it). Furthermore, there is at least one other kind of hang possible due to the oplock break handling if there is only worker. (This can be reproduced without introducing memory pressure by having passing 1 for the max_active parameter of cifsiod.) cifs_oplock_break() can wait indefintely in the filemap_fdatawait() while the cifs_writev_complete() work is blocked: sysrq: SysRq : Show Blocked State task PC stack pid father kworker/0:1 D 0 16 2 0x00000000 Workqueue: cifsiod cifs_oplock_break Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x4a/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 cifs_oplock_break+0x651/0x710 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 dd D 0 683 171 0x00000000 Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x29/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 filemap_write_and_wait+0x4e/0x70 cifs_flush+0x6a/0xb0 filp_close+0x52/0xa0 __close_fd+0xdc/0x150 SyS_close+0x33/0x60 entry_SYSCALL_64_fastpath+0x1f/0xbe Showing all locks held in the system: 2 locks held by kworker/0:1/16: #0: ("cifsiod"){.+.+.+}, at: process_one_work+0x255/0x8e0 #1: ((&cfile->oplock_break)){+.+.+.}, at: process_one_work+0x255/0x8e0 Showing busy workqueues and worker pools: workqueue cifsiod: flags=0xc pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/1 in-flight: 16:cifs_oplock_break delayed: cifs_writev_complete, cifs_echo_request pool 0: cpus=0 node=0 flags=0x0 nice=0 hung=0s workers=3 idle: 750 3 Fix these problems by creating a a new workqueue (with a rescuer) for the oplock break work. Signed-off-by: Rabin Vincent <rabinv@axis.com> Signed-off-by: Steve French <smfrench@gmail.com> CC: Stable <stable@vger.kernel.org>
2017-05-03 15:54:01 +00:00
out_destroy_cifsoplockd_wq:
destroy_workqueue(cifsoplockd_wq);
cifs: move cifsFileInfo_put logic into a work-queue This patch moves the final part of the cifsFileInfo_put() logic where we need a write lock on lock_sem to be processed in a separate thread that holds no other locks. This is to prevent deadlocks like the one below: > there are 6 processes looping to while trying to down_write > cinode->lock_sem, 5 of them from _cifsFileInfo_put, and one from > cifs_new_fileinfo > > and there are 5 other processes which are blocked, several of them > waiting on either PG_writeback or PG_locked (which are both set), all > for the same page of the file > > 2 inode_lock() (inode->i_rwsem) for the file > 1 wait_on_page_writeback() for the page > 1 down_read(inode->i_rwsem) for the inode of the directory > 1 inode_lock()(inode->i_rwsem) for the inode of the directory > 1 __lock_page > > > so processes are blocked waiting on: > page flags PG_locked and PG_writeback for one specific page > inode->i_rwsem for the directory > inode->i_rwsem for the file > cifsInodeInflock_sem > > > > here are the more gory details (let me know if I need to provide > anything more/better): > > [0 00:48:22.765] [UN] PID: 8863 TASK: ffff8c691547c5c0 CPU: 3 > COMMAND: "reopen_file" > #0 [ffff9965007e3ba8] __schedule at ffffffff9b6e6095 > #1 [ffff9965007e3c38] schedule at ffffffff9b6e64df > #2 [ffff9965007e3c48] rwsem_down_write_slowpath at ffffffff9af283d7 > #3 [ffff9965007e3cb8] legitimize_path at ffffffff9b0f975d > #4 [ffff9965007e3d08] path_openat at ffffffff9b0fe55d > #5 [ffff9965007e3dd8] do_filp_open at ffffffff9b100a33 > #6 [ffff9965007e3ee0] do_sys_open at ffffffff9b0eb2d6 > #7 [ffff9965007e3f38] do_syscall_64 at ffffffff9ae04315 > * (I think legitimize_path is bogus) > > in path_openat > } else { > const char *s = path_init(nd, flags); > while (!(error = link_path_walk(s, nd)) && > (error = do_last(nd, file, op)) > 0) { <<<< > > do_last: > if (open_flag & O_CREAT) > inode_lock(dir->d_inode); <<<< > else > so it's trying to take inode->i_rwsem for the directory > > DENTRY INODE SUPERBLK TYPE PATH > ffff8c68bb8e79c0 ffff8c691158ef20 ffff8c6915bf9000 DIR /mnt/vm1_smb/ > inode.i_rwsem is ffff8c691158efc0 > > <struct rw_semaphore 0xffff8c691158efc0>: > owner: <struct task_struct 0xffff8c6914275d00> (UN - 8856 - > reopen_file), counter: 0x0000000000000003 > waitlist: 2 > 0xffff9965007e3c90 8863 reopen_file UN 0 1:29:22.926 > RWSEM_WAITING_FOR_WRITE > 0xffff996500393e00 9802 ls UN 0 1:17:26.700 > RWSEM_WAITING_FOR_READ > > > the owner of the inode.i_rwsem of the directory is: > > [0 00:00:00.109] [UN] PID: 8856 TASK: ffff8c6914275d00 CPU: 3 > COMMAND: "reopen_file" > #0 [ffff99650065b828] __schedule at ffffffff9b6e6095 > #1 [ffff99650065b8b8] schedule at ffffffff9b6e64df > #2 [ffff99650065b8c8] schedule_timeout at ffffffff9b6e9f89 > #3 [ffff99650065b940] msleep at ffffffff9af573a9 > #4 [ffff99650065b948] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs] > #5 [ffff99650065ba38] cifs_writepage_locked at ffffffffc0a0b8f3 [cifs] > #6 [ffff99650065bab0] cifs_launder_page at ffffffffc0a0bb72 [cifs] > #7 [ffff99650065bb30] invalidate_inode_pages2_range at ffffffff9b04d4bd > #8 [ffff99650065bcb8] cifs_invalidate_mapping at ffffffffc0a11339 [cifs] > #9 [ffff99650065bcd0] cifs_revalidate_mapping at ffffffffc0a1139a [cifs] > #10 [ffff99650065bcf0] cifs_d_revalidate at ffffffffc0a014f6 [cifs] > #11 [ffff99650065bd08] path_openat at ffffffff9b0fe7f7 > #12 [ffff99650065bdd8] do_filp_open at ffffffff9b100a33 > #13 [ffff99650065bee0] do_sys_open at ffffffff9b0eb2d6 > #14 [ffff99650065bf38] do_syscall_64 at ffffffff9ae04315 > > cifs_launder_page is for page 0xffffd1e2c07d2480 > > crash> page.index,mapping,flags 0xffffd1e2c07d2480 > index = 0x8 > mapping = 0xffff8c68f3cd0db0 > flags = 0xfffffc0008095 > > PAGE-FLAG BIT VALUE > PG_locked 0 0000001 > PG_uptodate 2 0000004 > PG_lru 4 0000010 > PG_waiters 7 0000080 > PG_writeback 15 0008000 > > > inode is ffff8c68f3cd0c40 > inode.i_rwsem is ffff8c68f3cd0ce0 > DENTRY INODE SUPERBLK TYPE PATH > ffff8c68a1f1b480 ffff8c68f3cd0c40 ffff8c6915bf9000 REG > /mnt/vm1_smb/testfile.8853 > > > this process holds the inode->i_rwsem for the parent directory, is > laundering a page attached to the inode of the file it's opening, and in > _cifsFileInfo_put is trying to down_write the cifsInodeInflock_sem > for the file itself. > > > <struct rw_semaphore 0xffff8c68f3cd0ce0>: > owner: <struct task_struct 0xffff8c6914272e80> (UN - 8854 - > reopen_file), counter: 0x0000000000000003 > waitlist: 1 > 0xffff9965005dfd80 8855 reopen_file UN 0 1:29:22.912 > RWSEM_WAITING_FOR_WRITE > > this is the inode.i_rwsem for the file > > the owner: > > [0 00:48:22.739] [UN] PID: 8854 TASK: ffff8c6914272e80 CPU: 2 > COMMAND: "reopen_file" > #0 [ffff99650054fb38] __schedule at ffffffff9b6e6095 > #1 [ffff99650054fbc8] schedule at ffffffff9b6e64df > #2 [ffff99650054fbd8] io_schedule at ffffffff9b6e68e2 > #3 [ffff99650054fbe8] __lock_page at ffffffff9b03c56f > #4 [ffff99650054fc80] pagecache_get_page at ffffffff9b03dcdf > #5 [ffff99650054fcc0] grab_cache_page_write_begin at ffffffff9b03ef4c > #6 [ffff99650054fcd0] cifs_write_begin at ffffffffc0a064ec [cifs] > #7 [ffff99650054fd30] generic_perform_write at ffffffff9b03bba4 > #8 [ffff99650054fda8] __generic_file_write_iter at ffffffff9b04060a > #9 [ffff99650054fdf0] cifs_strict_writev.cold.70 at ffffffffc0a4469b [cifs] > #10 [ffff99650054fe48] new_sync_write at ffffffff9b0ec1dd > #11 [ffff99650054fed0] vfs_write at ffffffff9b0eed35 > #12 [ffff99650054ff00] ksys_write at ffffffff9b0eefd9 > #13 [ffff99650054ff38] do_syscall_64 at ffffffff9ae04315 > > the process holds the inode->i_rwsem for the file to which it's writing, > and is trying to __lock_page for the same page as in the other processes > > > the other tasks: > [0 00:00:00.028] [UN] PID: 8859 TASK: ffff8c6915479740 CPU: 2 > COMMAND: "reopen_file" > #0 [ffff9965007b39d8] __schedule at ffffffff9b6e6095 > #1 [ffff9965007b3a68] schedule at ffffffff9b6e64df > #2 [ffff9965007b3a78] schedule_timeout at ffffffff9b6e9f89 > #3 [ffff9965007b3af0] msleep at ffffffff9af573a9 > #4 [ffff9965007b3af8] cifs_new_fileinfo.cold.61 at ffffffffc0a42a07 [cifs] > #5 [ffff9965007b3b78] cifs_open at ffffffffc0a0709d [cifs] > #6 [ffff9965007b3cd8] do_dentry_open at ffffffff9b0e9b7a > #7 [ffff9965007b3d08] path_openat at ffffffff9b0fe34f > #8 [ffff9965007b3dd8] do_filp_open at ffffffff9b100a33 > #9 [ffff9965007b3ee0] do_sys_open at ffffffff9b0eb2d6 > #10 [ffff9965007b3f38] do_syscall_64 at ffffffff9ae04315 > > this is opening the file, and is trying to down_write cinode->lock_sem > > > [0 00:00:00.041] [UN] PID: 8860 TASK: ffff8c691547ae80 CPU: 2 > COMMAND: "reopen_file" > [0 00:00:00.057] [UN] PID: 8861 TASK: ffff8c6915478000 CPU: 3 > COMMAND: "reopen_file" > [0 00:00:00.059] [UN] PID: 8858 TASK: ffff8c6914271740 CPU: 2 > COMMAND: "reopen_file" > [0 00:00:00.109] [UN] PID: 8862 TASK: ffff8c691547dd00 CPU: 6 > COMMAND: "reopen_file" > #0 [ffff9965007c3c78] __schedule at ffffffff9b6e6095 > #1 [ffff9965007c3d08] schedule at ffffffff9b6e64df > #2 [ffff9965007c3d18] schedule_timeout at ffffffff9b6e9f89 > #3 [ffff9965007c3d90] msleep at ffffffff9af573a9 > #4 [ffff9965007c3d98] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs] > #5 [ffff9965007c3e88] cifs_close at ffffffffc0a07aaf [cifs] > #6 [ffff9965007c3ea0] __fput at ffffffff9b0efa6e > #7 [ffff9965007c3ee8] task_work_run at ffffffff9aef1614 > #8 [ffff9965007c3f20] exit_to_usermode_loop at ffffffff9ae03d6f > #9 [ffff9965007c3f38] do_syscall_64 at ffffffff9ae0444c > > closing the file, and trying to down_write cifsi->lock_sem > > > [0 00:48:22.839] [UN] PID: 8857 TASK: ffff8c6914270000 CPU: 7 > COMMAND: "reopen_file" > #0 [ffff9965006a7cc8] __schedule at ffffffff9b6e6095 > #1 [ffff9965006a7d58] schedule at ffffffff9b6e64df > #2 [ffff9965006a7d68] io_schedule at ffffffff9b6e68e2 > #3 [ffff9965006a7d78] wait_on_page_bit at ffffffff9b03cac6 > #4 [ffff9965006a7e10] __filemap_fdatawait_range at ffffffff9b03b028 > #5 [ffff9965006a7ed8] filemap_write_and_wait at ffffffff9b040165 > #6 [ffff9965006a7ef0] cifs_flush at ffffffffc0a0c2fa [cifs] > #7 [ffff9965006a7f10] filp_close at ffffffff9b0e93f1 > #8 [ffff9965006a7f30] __x64_sys_close at ffffffff9b0e9a0e > #9 [ffff9965006a7f38] do_syscall_64 at ffffffff9ae04315 > > in __filemap_fdatawait_range > wait_on_page_writeback(page); > for the same page of the file > > > > [0 00:48:22.718] [UN] PID: 8855 TASK: ffff8c69142745c0 CPU: 7 > COMMAND: "reopen_file" > #0 [ffff9965005dfc98] __schedule at ffffffff9b6e6095 > #1 [ffff9965005dfd28] schedule at ffffffff9b6e64df > #2 [ffff9965005dfd38] rwsem_down_write_slowpath at ffffffff9af283d7 > #3 [ffff9965005dfdf0] cifs_strict_writev at ffffffffc0a0c40a [cifs] > #4 [ffff9965005dfe48] new_sync_write at ffffffff9b0ec1dd > #5 [ffff9965005dfed0] vfs_write at ffffffff9b0eed35 > #6 [ffff9965005dff00] ksys_write at ffffffff9b0eefd9 > #7 [ffff9965005dff38] do_syscall_64 at ffffffff9ae04315 > > inode_lock(inode); > > > and one 'ls' later on, to see whether the rest of the mount is available > (the test file is in the root, so we get blocked up on the directory > ->i_rwsem), so the entire mount is unavailable > > [0 00:36:26.473] [UN] PID: 9802 TASK: ffff8c691436ae80 CPU: 4 > COMMAND: "ls" > #0 [ffff996500393d28] __schedule at ffffffff9b6e6095 > #1 [ffff996500393db8] schedule at ffffffff9b6e64df > #2 [ffff996500393dc8] rwsem_down_read_slowpath at ffffffff9b6e9421 > #3 [ffff996500393e78] down_read_killable at ffffffff9b6e95e2 > #4 [ffff996500393e88] iterate_dir at ffffffff9b103c56 > #5 [ffff996500393ec8] ksys_getdents64 at ffffffff9b104b0c > #6 [ffff996500393f30] __x64_sys_getdents64 at ffffffff9b104bb6 > #7 [ffff996500393f38] do_syscall_64 at ffffffff9ae04315 > > in iterate_dir: > if (shared) > res = down_read_killable(&inode->i_rwsem); <<<< > else > res = down_write_killable(&inode->i_rwsem); > Reported-by: Frank Sorenson <sorenson@redhat.com> Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com> Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2019-11-03 03:06:37 +00:00
out_destroy_fileinfo_put_wq:
destroy_workqueue(fileinfo_put_wq);
out_destroy_decrypt_wq:
destroy_workqueue(decrypt_wq);
CIFS: fix oplock break deadlocks When the final cifsFileInfo_put() is called from cifsiod and an oplock break work is queued, lockdep complains loudly: ============================================= [ INFO: possible recursive locking detected ] 4.11.0+ #21 Not tainted --------------------------------------------- kworker/0:2/78 is trying to acquire lock: ("cifsiod"){++++.+}, at: flush_work+0x215/0x350 but task is already holding lock: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock("cifsiod"); lock("cifsiod"); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by kworker/0:2/78: #0: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 #1: ((&wdata->work)){+.+...}, at: process_one_work+0x255/0x8e0 stack backtrace: CPU: 0 PID: 78 Comm: kworker/0:2 Not tainted 4.11.0+ #21 Workqueue: cifsiod cifs_writev_complete Call Trace: dump_stack+0x85/0xc2 __lock_acquire+0x17dd/0x2260 ? match_held_lock+0x20/0x2b0 ? trace_hardirqs_off_caller+0x86/0x130 ? mark_lock+0xa6/0x920 lock_acquire+0xcc/0x260 ? lock_acquire+0xcc/0x260 ? flush_work+0x215/0x350 flush_work+0x236/0x350 ? flush_work+0x215/0x350 ? destroy_worker+0x170/0x170 __cancel_work_timer+0x17d/0x210 ? ___preempt_schedule+0x16/0x18 cancel_work_sync+0x10/0x20 cifsFileInfo_put+0x338/0x7f0 cifs_writedata_release+0x2a/0x40 ? cifs_writedata_release+0x2a/0x40 cifs_writev_complete+0x29d/0x850 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 This is a real warning. Since the oplock is queued on the same workqueue this can deadlock if there is only one worker thread active for the workqueue (which will be the case during memory pressure when the rescuer thread is handling it). Furthermore, there is at least one other kind of hang possible due to the oplock break handling if there is only worker. (This can be reproduced without introducing memory pressure by having passing 1 for the max_active parameter of cifsiod.) cifs_oplock_break() can wait indefintely in the filemap_fdatawait() while the cifs_writev_complete() work is blocked: sysrq: SysRq : Show Blocked State task PC stack pid father kworker/0:1 D 0 16 2 0x00000000 Workqueue: cifsiod cifs_oplock_break Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x4a/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 cifs_oplock_break+0x651/0x710 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 dd D 0 683 171 0x00000000 Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x29/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 filemap_write_and_wait+0x4e/0x70 cifs_flush+0x6a/0xb0 filp_close+0x52/0xa0 __close_fd+0xdc/0x150 SyS_close+0x33/0x60 entry_SYSCALL_64_fastpath+0x1f/0xbe Showing all locks held in the system: 2 locks held by kworker/0:1/16: #0: ("cifsiod"){.+.+.+}, at: process_one_work+0x255/0x8e0 #1: ((&cfile->oplock_break)){+.+.+.}, at: process_one_work+0x255/0x8e0 Showing busy workqueues and worker pools: workqueue cifsiod: flags=0xc pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/1 in-flight: 16:cifs_oplock_break delayed: cifs_writev_complete, cifs_echo_request pool 0: cpus=0 node=0 flags=0x0 nice=0 hung=0s workers=3 idle: 750 3 Fix these problems by creating a a new workqueue (with a rescuer) for the oplock break work. Signed-off-by: Rabin Vincent <rabinv@axis.com> Signed-off-by: Steve French <smfrench@gmail.com> CC: Stable <stable@vger.kernel.org>
2017-05-03 15:54:01 +00:00
out_destroy_cifsiod_wq:
destroy_workqueue(cifsiod_wq);
out_clean_proc:
cifs_proc_clean();
return rc;
}
static void __exit
exit_cifs(void)
{
cifs_dbg(NOISY, "exit_smb3\n");
unregister_filesystem(&cifs_fs_type);
unregister_filesystem(&smb3_fs_type);
cifs_release_automount_timer();
exit_cifs_idmap();
#ifdef CONFIG_CIFS_SWN_UPCALL
cifs_genl_exit();
#endif
#ifdef CONFIG_CIFS_UPCALL
exit_cifs_spnego();
#endif
#ifdef CONFIG_CIFS_DFS_UPCALL
dfs_cache_destroy();
#endif
cifs_destroy_request_bufs();
destroy_mids();
cifs_destroy_inodecache();
destroy_workqueue(deferredclose_wq);
CIFS: fix oplock break deadlocks When the final cifsFileInfo_put() is called from cifsiod and an oplock break work is queued, lockdep complains loudly: ============================================= [ INFO: possible recursive locking detected ] 4.11.0+ #21 Not tainted --------------------------------------------- kworker/0:2/78 is trying to acquire lock: ("cifsiod"){++++.+}, at: flush_work+0x215/0x350 but task is already holding lock: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock("cifsiod"); lock("cifsiod"); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by kworker/0:2/78: #0: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 #1: ((&wdata->work)){+.+...}, at: process_one_work+0x255/0x8e0 stack backtrace: CPU: 0 PID: 78 Comm: kworker/0:2 Not tainted 4.11.0+ #21 Workqueue: cifsiod cifs_writev_complete Call Trace: dump_stack+0x85/0xc2 __lock_acquire+0x17dd/0x2260 ? match_held_lock+0x20/0x2b0 ? trace_hardirqs_off_caller+0x86/0x130 ? mark_lock+0xa6/0x920 lock_acquire+0xcc/0x260 ? lock_acquire+0xcc/0x260 ? flush_work+0x215/0x350 flush_work+0x236/0x350 ? flush_work+0x215/0x350 ? destroy_worker+0x170/0x170 __cancel_work_timer+0x17d/0x210 ? ___preempt_schedule+0x16/0x18 cancel_work_sync+0x10/0x20 cifsFileInfo_put+0x338/0x7f0 cifs_writedata_release+0x2a/0x40 ? cifs_writedata_release+0x2a/0x40 cifs_writev_complete+0x29d/0x850 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 This is a real warning. Since the oplock is queued on the same workqueue this can deadlock if there is only one worker thread active for the workqueue (which will be the case during memory pressure when the rescuer thread is handling it). Furthermore, there is at least one other kind of hang possible due to the oplock break handling if there is only worker. (This can be reproduced without introducing memory pressure by having passing 1 for the max_active parameter of cifsiod.) cifs_oplock_break() can wait indefintely in the filemap_fdatawait() while the cifs_writev_complete() work is blocked: sysrq: SysRq : Show Blocked State task PC stack pid father kworker/0:1 D 0 16 2 0x00000000 Workqueue: cifsiod cifs_oplock_break Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x4a/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 cifs_oplock_break+0x651/0x710 ? preempt_count_sub+0x18/0xd0 process_one_work+0x304/0x8e0 worker_thread+0x9b/0x6a0 kthread+0x1b2/0x200 ? process_one_work+0x8e0/0x8e0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x31/0x40 dd D 0 683 171 0x00000000 Call Trace: __schedule+0x562/0xf40 ? mark_held_locks+0x29/0xb0 schedule+0x57/0xe0 io_schedule+0x21/0x50 wait_on_page_bit+0x143/0x190 ? add_to_page_cache_lru+0x150/0x150 __filemap_fdatawait_range+0x134/0x190 ? do_writepages+0x51/0x70 filemap_fdatawait_range+0x14/0x30 filemap_fdatawait+0x3b/0x40 filemap_write_and_wait+0x4e/0x70 cifs_flush+0x6a/0xb0 filp_close+0x52/0xa0 __close_fd+0xdc/0x150 SyS_close+0x33/0x60 entry_SYSCALL_64_fastpath+0x1f/0xbe Showing all locks held in the system: 2 locks held by kworker/0:1/16: #0: ("cifsiod"){.+.+.+}, at: process_one_work+0x255/0x8e0 #1: ((&cfile->oplock_break)){+.+.+.}, at: process_one_work+0x255/0x8e0 Showing busy workqueues and worker pools: workqueue cifsiod: flags=0xc pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/1 in-flight: 16:cifs_oplock_break delayed: cifs_writev_complete, cifs_echo_request pool 0: cpus=0 node=0 flags=0x0 nice=0 hung=0s workers=3 idle: 750 3 Fix these problems by creating a a new workqueue (with a rescuer) for the oplock break work. Signed-off-by: Rabin Vincent <rabinv@axis.com> Signed-off-by: Steve French <smfrench@gmail.com> CC: Stable <stable@vger.kernel.org>
2017-05-03 15:54:01 +00:00
destroy_workqueue(cifsoplockd_wq);
destroy_workqueue(decrypt_wq);
cifs: move cifsFileInfo_put logic into a work-queue This patch moves the final part of the cifsFileInfo_put() logic where we need a write lock on lock_sem to be processed in a separate thread that holds no other locks. This is to prevent deadlocks like the one below: > there are 6 processes looping to while trying to down_write > cinode->lock_sem, 5 of them from _cifsFileInfo_put, and one from > cifs_new_fileinfo > > and there are 5 other processes which are blocked, several of them > waiting on either PG_writeback or PG_locked (which are both set), all > for the same page of the file > > 2 inode_lock() (inode->i_rwsem) for the file > 1 wait_on_page_writeback() for the page > 1 down_read(inode->i_rwsem) for the inode of the directory > 1 inode_lock()(inode->i_rwsem) for the inode of the directory > 1 __lock_page > > > so processes are blocked waiting on: > page flags PG_locked and PG_writeback for one specific page > inode->i_rwsem for the directory > inode->i_rwsem for the file > cifsInodeInflock_sem > > > > here are the more gory details (let me know if I need to provide > anything more/better): > > [0 00:48:22.765] [UN] PID: 8863 TASK: ffff8c691547c5c0 CPU: 3 > COMMAND: "reopen_file" > #0 [ffff9965007e3ba8] __schedule at ffffffff9b6e6095 > #1 [ffff9965007e3c38] schedule at ffffffff9b6e64df > #2 [ffff9965007e3c48] rwsem_down_write_slowpath at ffffffff9af283d7 > #3 [ffff9965007e3cb8] legitimize_path at ffffffff9b0f975d > #4 [ffff9965007e3d08] path_openat at ffffffff9b0fe55d > #5 [ffff9965007e3dd8] do_filp_open at ffffffff9b100a33 > #6 [ffff9965007e3ee0] do_sys_open at ffffffff9b0eb2d6 > #7 [ffff9965007e3f38] do_syscall_64 at ffffffff9ae04315 > * (I think legitimize_path is bogus) > > in path_openat > } else { > const char *s = path_init(nd, flags); > while (!(error = link_path_walk(s, nd)) && > (error = do_last(nd, file, op)) > 0) { <<<< > > do_last: > if (open_flag & O_CREAT) > inode_lock(dir->d_inode); <<<< > else > so it's trying to take inode->i_rwsem for the directory > > DENTRY INODE SUPERBLK TYPE PATH > ffff8c68bb8e79c0 ffff8c691158ef20 ffff8c6915bf9000 DIR /mnt/vm1_smb/ > inode.i_rwsem is ffff8c691158efc0 > > <struct rw_semaphore 0xffff8c691158efc0>: > owner: <struct task_struct 0xffff8c6914275d00> (UN - 8856 - > reopen_file), counter: 0x0000000000000003 > waitlist: 2 > 0xffff9965007e3c90 8863 reopen_file UN 0 1:29:22.926 > RWSEM_WAITING_FOR_WRITE > 0xffff996500393e00 9802 ls UN 0 1:17:26.700 > RWSEM_WAITING_FOR_READ > > > the owner of the inode.i_rwsem of the directory is: > > [0 00:00:00.109] [UN] PID: 8856 TASK: ffff8c6914275d00 CPU: 3 > COMMAND: "reopen_file" > #0 [ffff99650065b828] __schedule at ffffffff9b6e6095 > #1 [ffff99650065b8b8] schedule at ffffffff9b6e64df > #2 [ffff99650065b8c8] schedule_timeout at ffffffff9b6e9f89 > #3 [ffff99650065b940] msleep at ffffffff9af573a9 > #4 [ffff99650065b948] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs] > #5 [ffff99650065ba38] cifs_writepage_locked at ffffffffc0a0b8f3 [cifs] > #6 [ffff99650065bab0] cifs_launder_page at ffffffffc0a0bb72 [cifs] > #7 [ffff99650065bb30] invalidate_inode_pages2_range at ffffffff9b04d4bd > #8 [ffff99650065bcb8] cifs_invalidate_mapping at ffffffffc0a11339 [cifs] > #9 [ffff99650065bcd0] cifs_revalidate_mapping at ffffffffc0a1139a [cifs] > #10 [ffff99650065bcf0] cifs_d_revalidate at ffffffffc0a014f6 [cifs] > #11 [ffff99650065bd08] path_openat at ffffffff9b0fe7f7 > #12 [ffff99650065bdd8] do_filp_open at ffffffff9b100a33 > #13 [ffff99650065bee0] do_sys_open at ffffffff9b0eb2d6 > #14 [ffff99650065bf38] do_syscall_64 at ffffffff9ae04315 > > cifs_launder_page is for page 0xffffd1e2c07d2480 > > crash> page.index,mapping,flags 0xffffd1e2c07d2480 > index = 0x8 > mapping = 0xffff8c68f3cd0db0 > flags = 0xfffffc0008095 > > PAGE-FLAG BIT VALUE > PG_locked 0 0000001 > PG_uptodate 2 0000004 > PG_lru 4 0000010 > PG_waiters 7 0000080 > PG_writeback 15 0008000 > > > inode is ffff8c68f3cd0c40 > inode.i_rwsem is ffff8c68f3cd0ce0 > DENTRY INODE SUPERBLK TYPE PATH > ffff8c68a1f1b480 ffff8c68f3cd0c40 ffff8c6915bf9000 REG > /mnt/vm1_smb/testfile.8853 > > > this process holds the inode->i_rwsem for the parent directory, is > laundering a page attached to the inode of the file it's opening, and in > _cifsFileInfo_put is trying to down_write the cifsInodeInflock_sem > for the file itself. > > > <struct rw_semaphore 0xffff8c68f3cd0ce0>: > owner: <struct task_struct 0xffff8c6914272e80> (UN - 8854 - > reopen_file), counter: 0x0000000000000003 > waitlist: 1 > 0xffff9965005dfd80 8855 reopen_file UN 0 1:29:22.912 > RWSEM_WAITING_FOR_WRITE > > this is the inode.i_rwsem for the file > > the owner: > > [0 00:48:22.739] [UN] PID: 8854 TASK: ffff8c6914272e80 CPU: 2 > COMMAND: "reopen_file" > #0 [ffff99650054fb38] __schedule at ffffffff9b6e6095 > #1 [ffff99650054fbc8] schedule at ffffffff9b6e64df > #2 [ffff99650054fbd8] io_schedule at ffffffff9b6e68e2 > #3 [ffff99650054fbe8] __lock_page at ffffffff9b03c56f > #4 [ffff99650054fc80] pagecache_get_page at ffffffff9b03dcdf > #5 [ffff99650054fcc0] grab_cache_page_write_begin at ffffffff9b03ef4c > #6 [ffff99650054fcd0] cifs_write_begin at ffffffffc0a064ec [cifs] > #7 [ffff99650054fd30] generic_perform_write at ffffffff9b03bba4 > #8 [ffff99650054fda8] __generic_file_write_iter at ffffffff9b04060a > #9 [ffff99650054fdf0] cifs_strict_writev.cold.70 at ffffffffc0a4469b [cifs] > #10 [ffff99650054fe48] new_sync_write at ffffffff9b0ec1dd > #11 [ffff99650054fed0] vfs_write at ffffffff9b0eed35 > #12 [ffff99650054ff00] ksys_write at ffffffff9b0eefd9 > #13 [ffff99650054ff38] do_syscall_64 at ffffffff9ae04315 > > the process holds the inode->i_rwsem for the file to which it's writing, > and is trying to __lock_page for the same page as in the other processes > > > the other tasks: > [0 00:00:00.028] [UN] PID: 8859 TASK: ffff8c6915479740 CPU: 2 > COMMAND: "reopen_file" > #0 [ffff9965007b39d8] __schedule at ffffffff9b6e6095 > #1 [ffff9965007b3a68] schedule at ffffffff9b6e64df > #2 [ffff9965007b3a78] schedule_timeout at ffffffff9b6e9f89 > #3 [ffff9965007b3af0] msleep at ffffffff9af573a9 > #4 [ffff9965007b3af8] cifs_new_fileinfo.cold.61 at ffffffffc0a42a07 [cifs] > #5 [ffff9965007b3b78] cifs_open at ffffffffc0a0709d [cifs] > #6 [ffff9965007b3cd8] do_dentry_open at ffffffff9b0e9b7a > #7 [ffff9965007b3d08] path_openat at ffffffff9b0fe34f > #8 [ffff9965007b3dd8] do_filp_open at ffffffff9b100a33 > #9 [ffff9965007b3ee0] do_sys_open at ffffffff9b0eb2d6 > #10 [ffff9965007b3f38] do_syscall_64 at ffffffff9ae04315 > > this is opening the file, and is trying to down_write cinode->lock_sem > > > [0 00:00:00.041] [UN] PID: 8860 TASK: ffff8c691547ae80 CPU: 2 > COMMAND: "reopen_file" > [0 00:00:00.057] [UN] PID: 8861 TASK: ffff8c6915478000 CPU: 3 > COMMAND: "reopen_file" > [0 00:00:00.059] [UN] PID: 8858 TASK: ffff8c6914271740 CPU: 2 > COMMAND: "reopen_file" > [0 00:00:00.109] [UN] PID: 8862 TASK: ffff8c691547dd00 CPU: 6 > COMMAND: "reopen_file" > #0 [ffff9965007c3c78] __schedule at ffffffff9b6e6095 > #1 [ffff9965007c3d08] schedule at ffffffff9b6e64df > #2 [ffff9965007c3d18] schedule_timeout at ffffffff9b6e9f89 > #3 [ffff9965007c3d90] msleep at ffffffff9af573a9 > #4 [ffff9965007c3d98] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs] > #5 [ffff9965007c3e88] cifs_close at ffffffffc0a07aaf [cifs] > #6 [ffff9965007c3ea0] __fput at ffffffff9b0efa6e > #7 [ffff9965007c3ee8] task_work_run at ffffffff9aef1614 > #8 [ffff9965007c3f20] exit_to_usermode_loop at ffffffff9ae03d6f > #9 [ffff9965007c3f38] do_syscall_64 at ffffffff9ae0444c > > closing the file, and trying to down_write cifsi->lock_sem > > > [0 00:48:22.839] [UN] PID: 8857 TASK: ffff8c6914270000 CPU: 7 > COMMAND: "reopen_file" > #0 [ffff9965006a7cc8] __schedule at ffffffff9b6e6095 > #1 [ffff9965006a7d58] schedule at ffffffff9b6e64df > #2 [ffff9965006a7d68] io_schedule at ffffffff9b6e68e2 > #3 [ffff9965006a7d78] wait_on_page_bit at ffffffff9b03cac6 > #4 [ffff9965006a7e10] __filemap_fdatawait_range at ffffffff9b03b028 > #5 [ffff9965006a7ed8] filemap_write_and_wait at ffffffff9b040165 > #6 [ffff9965006a7ef0] cifs_flush at ffffffffc0a0c2fa [cifs] > #7 [ffff9965006a7f10] filp_close at ffffffff9b0e93f1 > #8 [ffff9965006a7f30] __x64_sys_close at ffffffff9b0e9a0e > #9 [ffff9965006a7f38] do_syscall_64 at ffffffff9ae04315 > > in __filemap_fdatawait_range > wait_on_page_writeback(page); > for the same page of the file > > > > [0 00:48:22.718] [UN] PID: 8855 TASK: ffff8c69142745c0 CPU: 7 > COMMAND: "reopen_file" > #0 [ffff9965005dfc98] __schedule at ffffffff9b6e6095 > #1 [ffff9965005dfd28] schedule at ffffffff9b6e64df > #2 [ffff9965005dfd38] rwsem_down_write_slowpath at ffffffff9af283d7 > #3 [ffff9965005dfdf0] cifs_strict_writev at ffffffffc0a0c40a [cifs] > #4 [ffff9965005dfe48] new_sync_write at ffffffff9b0ec1dd > #5 [ffff9965005dfed0] vfs_write at ffffffff9b0eed35 > #6 [ffff9965005dff00] ksys_write at ffffffff9b0eefd9 > #7 [ffff9965005dff38] do_syscall_64 at ffffffff9ae04315 > > inode_lock(inode); > > > and one 'ls' later on, to see whether the rest of the mount is available > (the test file is in the root, so we get blocked up on the directory > ->i_rwsem), so the entire mount is unavailable > > [0 00:36:26.473] [UN] PID: 9802 TASK: ffff8c691436ae80 CPU: 4 > COMMAND: "ls" > #0 [ffff996500393d28] __schedule at ffffffff9b6e6095 > #1 [ffff996500393db8] schedule at ffffffff9b6e64df > #2 [ffff996500393dc8] rwsem_down_read_slowpath at ffffffff9b6e9421 > #3 [ffff996500393e78] down_read_killable at ffffffff9b6e95e2 > #4 [ffff996500393e88] iterate_dir at ffffffff9b103c56 > #5 [ffff996500393ec8] ksys_getdents64 at ffffffff9b104b0c > #6 [ffff996500393f30] __x64_sys_getdents64 at ffffffff9b104bb6 > #7 [ffff996500393f38] do_syscall_64 at ffffffff9ae04315 > > in iterate_dir: > if (shared) > res = down_read_killable(&inode->i_rwsem); <<<< > else > res = down_write_killable(&inode->i_rwsem); > Reported-by: Frank Sorenson <sorenson@redhat.com> Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com> Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> Signed-off-by: Steve French <stfrench@microsoft.com>
2019-11-03 03:06:37 +00:00
destroy_workqueue(fileinfo_put_wq);
destroy_workqueue(cifsiod_wq);
cifs_proc_clean();
}
MODULE_AUTHOR("Steve French");
MODULE_LICENSE("GPL"); /* combination of LGPL + GPL source behaves as GPL */
MODULE_DESCRIPTION
("VFS to access SMB3 servers e.g. Samba, Macs, Azure and Windows (and "
"also older servers complying with the SNIA CIFS Specification)");
MODULE_VERSION(CIFS_VERSION);
MODULE_SOFTDEP("ecb");
MODULE_SOFTDEP("hmac");
MODULE_SOFTDEP("md5");
MODULE_SOFTDEP("nls");
MODULE_SOFTDEP("aes");
MODULE_SOFTDEP("cmac");
MODULE_SOFTDEP("sha256");
MODULE_SOFTDEP("sha512");
MODULE_SOFTDEP("aead2");
MODULE_SOFTDEP("ccm");
MODULE_SOFTDEP("gcm");
module_init(init_cifs)
module_exit(exit_cifs)