diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 18d69a4559d6..a4d64b1b4295 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -270,7 +270,10 @@ or bottom half). ->alloc_inode. ``dirty_inode`` - this method is called by the VFS to mark an inode dirty. + this method is called by the VFS when an inode is marked dirty. + This is specifically for the inode itself being marked dirty, + not its data. If the update needs to be persisted by fdatasync(), + then I_DIRTY_DATASYNC will be set in the flags argument. ``write_inode`` this method is called when the VFS needs to write an inode to diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c173c8405856..de7905284e28 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4961,15 +4961,11 @@ static void __ext4_update_other_inode_time(struct super_block *sb, if (!inode) return; - if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | - I_DIRTY_INODE)) || - ((inode->i_state & I_DIRTY_TIME) == 0)) + if (!inode_is_dirtytime_only(inode)) return; spin_lock(&inode->i_lock); - if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | - I_DIRTY_INODE)) == 0) && - (inode->i_state & I_DIRTY_TIME)) { + if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); inode->i_state &= ~I_DIRTY_TIME; @@ -5937,26 +5933,16 @@ int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. - * - * If only the I_DIRTY_TIME flag is set, we can skip everything. If - * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need - * to copy into the on-disk inode structure are the timestamp files. */ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; - if (flags == I_DIRTY_TIME) - return; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) - goto out; - + return; ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); -out: - return; } int ext4_change_inode_journal_flag(struct inode *inode, int val) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4acfa7d36731..7069793752f1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1300,9 +1300,6 @@ static void f2fs_dirty_inode(struct inode *inode, int flags) inode->i_ino == F2FS_META_INO(sbi)) return; - if (flags == I_DIRTY_TIME) - return; - if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) clear_inode_flag(inode, FI_AUTO_RECOVER); diff --git a/fs/fat/misc.c b/fs/fat/misc.c index f1b2a1fc2a6a..18a50a46b57f 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -329,22 +329,23 @@ EXPORT_SYMBOL_GPL(fat_truncate_time); int fat_update_time(struct inode *inode, struct timespec64 *now, int flags) { - int iflags = I_DIRTY_TIME; - bool dirty = false; + int dirty_flags = 0; if (inode->i_ino == MSDOS_ROOT_INO) return 0; - fat_truncate_time(inode, now, flags); - if (flags & S_VERSION) - dirty = inode_maybe_inc_iversion(inode, false); - if ((flags & (S_ATIME | S_CTIME | S_MTIME)) && - !(inode->i_sb->s_flags & SB_LAZYTIME)) - dirty = true; + if (flags & (S_ATIME | S_CTIME | S_MTIME)) { + fat_truncate_time(inode, now, flags); + if (inode->i_sb->s_flags & SB_LAZYTIME) + dirty_flags |= I_DIRTY_TIME; + else + dirty_flags |= I_DIRTY_SYNC; + } - if (dirty) - iflags |= I_DIRTY_SYNC; - __mark_inode_dirty(inode, iflags); + if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false)) + dirty_flags |= I_DIRTY_SYNC; + + __mark_inode_dirty(inode, dirty_flags); return 0; } EXPORT_SYMBOL_GPL(fat_update_time); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c41cb887eb7d..e91980f49388 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1442,9 +1442,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, } /* - * Write out an inode and its dirty pages. Do not update the writeback list - * linkage. That is left to the caller. The caller is also responsible for - * setting I_SYNC flag and calling inode_sync_complete() to clear it. + * Write out an inode and its dirty pages (or some of its dirty pages, depending + * on @wbc->nr_to_write), and clear the relevant dirty flags from i_state. + * + * This doesn't remove the inode from the writeback list it is on, except + * potentially to move it from b_dirty_time to b_dirty due to timestamp + * expiration. The caller is otherwise responsible for writeback list handling. + * + * The caller is also responsible for setting the I_SYNC flag beforehand and + * calling inode_sync_complete() to clear it afterwards. */ static int __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) @@ -1479,7 +1485,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * change I_DIRTY_TIME into I_DIRTY_SYNC. */ if ((inode->i_state & I_DIRTY_TIME) && - (wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync || + (wbc->sync_mode == WB_SYNC_ALL || time_after(jiffies, inode->dirtied_time_when + dirtytime_expire_interval * HZ))) { trace_writeback_lazytime(inode); @@ -1487,9 +1493,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } /* - * Some filesystems may redirty the inode during the writeback - * due to delalloc, clear dirty metadata flags right before - * write_inode() + * Get and clear the dirty flags from i_state. This needs to be done + * after calling writepages because some filesystems may redirty the + * inode during writepages due to delalloc. It also needs to be done + * after handling timestamp expiration, as that may dirty the inode too. */ spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; @@ -1524,12 +1531,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } /* - * Write out an inode's dirty pages. Either the caller has an active reference - * on the inode or the inode has I_WILL_FREE set. + * Write out an inode's dirty data and metadata on-demand, i.e. separately from + * the regular batched writeback done by the flusher threads in + * writeback_sb_inodes(). @wbc controls various aspects of the write, such as + * whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE). * - * This function is designed to be called for writing back one inode which - * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode() - * and does more profound writeback list handling in writeback_sb_inodes(). + * To prevent the inode from going away, either the caller must have a reference + * to the inode, or the inode must have I_WILL_FREE or I_FREEING set. */ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) @@ -1544,23 +1552,23 @@ static int writeback_single_inode(struct inode *inode, WARN_ON(inode->i_state & I_WILL_FREE); if (inode->i_state & I_SYNC) { + /* + * Writeback is already running on the inode. For WB_SYNC_NONE, + * that's enough and we can just return. For WB_SYNC_ALL, we + * must wait for the existing writeback to complete, then do + * writeback again if there's anything left. + */ if (wbc->sync_mode != WB_SYNC_ALL) goto out; - /* - * It's a data-integrity sync. We must wait. Since callers hold - * inode reference or inode has I_WILL_FREE set, it cannot go - * away under us. - */ __inode_wait_for_writeback(inode); } WARN_ON(inode->i_state & I_SYNC); /* - * Skip inode if it is clean and we have no outstanding writeback in - * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this - * function since flusher thread may be doing for example sync in - * parallel and if we move the inode, it could get skipped. So here we - * make sure inode is on some writeback list and leave it there unless - * we have completely cleaned the inode. + * If the inode is already fully clean, then there's nothing to do. + * + * For data-integrity syncs we also need to check whether any pages are + * still under writeback, e.g. due to prior WB_SYNC_NONE writeback. If + * there are any such pages, we'll need to wait for them. */ if (!(inode->i_state & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || @@ -1576,8 +1584,9 @@ static int writeback_single_inode(struct inode *inode, wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* - * If inode is clean, remove it from writeback lists. Otherwise don't - * touch it. See comment above for explanation. + * If the inode is now fully clean, then it can be safely removed from + * its writeback list (if any). Otherwise the flusher threads are + * responsible for the writeback lists. */ if (!(inode->i_state & I_DIRTY_ALL)) inode_io_list_del_locked(inode, wb); @@ -2219,23 +2228,24 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) } /** - * __mark_inode_dirty - internal function + * __mark_inode_dirty - internal function to mark an inode dirty * * @inode: inode to mark - * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) + * @flags: what kind of dirty, e.g. I_DIRTY_SYNC. This can be a combination of + * multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined + * with I_DIRTY_PAGES. * - * Mark an inode as dirty. Callers should use mark_inode_dirty or - * mark_inode_dirty_sync. + * Mark an inode as dirty. We notify the filesystem, then update the inode's + * dirty flags. Then, if needed we add the inode to the appropriate dirty list. * - * Put the inode on the super block's dirty list. + * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync() + * instead of calling this directly. * - * CAREFUL! We mark it dirty unconditionally, but move it onto the - * dirty list only if it is hashed or if it refers to a blockdev. - * If it was not hashed, it will never be added to the dirty list - * even if it is later hashed, as it will have been marked dirty already. + * CAREFUL! We only add the inode to the dirty list if it is hashed or if it + * refers to a blockdev. Unhashed inodes will never be added to the dirty list + * even if they are later hashed, as they will have been marked dirty already. * - * In short, make sure you hash any inodes _before_ you start marking - * them dirty. + * In short, ensure you hash any inodes _before_ you start marking them dirty. * * Note that for blockdevs, inode->dirtied_when represents the dirtying time of * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of @@ -2247,25 +2257,34 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; - int dirtytime; + int dirtytime = 0; trace_writeback_mark_inode_dirty(inode, flags); - /* - * Don't do this for I_DIRTY_PAGES - that doesn't actually - * dirty the inode itself - */ - if (flags & (I_DIRTY_INODE | I_DIRTY_TIME)) { + if (flags & I_DIRTY_INODE) { + /* + * Notify the filesystem about the inode being dirtied, so that + * (if needed) it can update on-disk fields and journal the + * inode. This is only needed when the inode itself is being + * dirtied now. I.e. it's only needed for I_DIRTY_INODE, not + * for just I_DIRTY_PAGES or I_DIRTY_TIME. + */ trace_writeback_dirty_inode_start(inode, flags); - if (sb->s_op->dirty_inode) - sb->s_op->dirty_inode(inode, flags); - + sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE); trace_writeback_dirty_inode(inode, flags); - } - if (flags & I_DIRTY_INODE) + + /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */ flags &= ~I_DIRTY_TIME; - dirtytime = flags & I_DIRTY_TIME; + } else { + /* + * Else it's either I_DIRTY_PAGES, I_DIRTY_TIME, or nothing. + * (We don't support setting both I_DIRTY_PAGES and I_DIRTY_TIME + * in one call to __mark_inode_dirty().) + */ + dirtytime = flags & I_DIRTY_TIME; + WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME); + } /* * Paired with smp_mb() in __writeback_single_inode() for the @@ -2288,6 +2307,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) inode_attach_wb(inode, NULL); + /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */ if (flags & I_DIRTY_INODE) inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 89609c299717..07f49e5e6304 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -749,7 +749,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - int sync_state = inode->i_state & I_DIRTY_ALL; + int sync_state = inode->i_state & I_DIRTY; struct gfs2_inode *ip = GFS2_I(inode); int ret = 0, ret1 = 0; @@ -762,7 +762,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, if (!gfs2_is_jdata(ip)) sync_state &= ~I_DIRTY_PAGES; if (datasync) - sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME); + sync_state &= ~I_DIRTY_SYNC; if (sync_state) { ret = sync_inode_metadata(inode, 1); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 2f56acc41c04..042b94288ff1 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -562,8 +562,6 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) int need_endtrans = 0; int ret; - if (!(flags & I_DIRTY_INODE)) - return; if (unlikely(gfs2_withdrawn(sdp))) return; if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { diff --git a/fs/inode.c b/fs/inode.c index 1dc9e032f659..874242169547 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1743,24 +1743,26 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, int generic_update_time(struct inode *inode, struct timespec64 *time, int flags) { - int iflags = I_DIRTY_TIME; - bool dirty = false; + int dirty_flags = 0; - if (flags & S_ATIME) - inode->i_atime = *time; - if (flags & S_VERSION) - dirty = inode_maybe_inc_iversion(inode, false); - if (flags & S_CTIME) - inode->i_ctime = *time; - if (flags & S_MTIME) - inode->i_mtime = *time; - if ((flags & (S_ATIME | S_CTIME | S_MTIME)) && - !(inode->i_sb->s_flags & SB_LAZYTIME)) - dirty = true; + if (flags & (S_ATIME | S_CTIME | S_MTIME)) { + if (flags & S_ATIME) + inode->i_atime = *time; + if (flags & S_CTIME) + inode->i_ctime = *time; + if (flags & S_MTIME) + inode->i_mtime = *time; - if (dirty) - iflags |= I_DIRTY_SYNC; - __mark_inode_dirty(inode, iflags); + if (inode->i_sb->s_flags & SB_LAZYTIME) + dirty_flags |= I_DIRTY_TIME; + else + dirty_flags |= I_DIRTY_SYNC; + } + + if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false)) + dirty_flags |= I_DIRTY_SYNC; + + __mark_inode_dirty(inode, dirty_flags); return 0; } EXPORT_SYMBOL(generic_update_time); diff --git a/include/linux/fs.h b/include/linux/fs.h index 6d8b1e7337e4..43ba79ddbd68 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2084,8 +2084,8 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, /* * Inode state bits. Protected by inode->i_lock * - * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, - * I_DIRTY_DATASYNC and I_DIRTY_PAGES. + * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, + * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. * * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at @@ -2094,12 +2094,20 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, * Two bits are used for locking and completion notification, I_NEW and I_SYNC. * * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on - * fdatasync(). i_atime is the usual cause. - * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of + * fdatasync() (unless I_DIRTY_DATASYNC is also set). + * Timestamp updates are the usual cause. + * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of * these changes separately from I_DIRTY_SYNC so that we * don't have to write inode on fdatasync() when only - * mtime has changed in it. + * e.g. the timestamps have changed. * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. + * I_DIRTY_TIME The inode itself only has dirty timestamps, and the + * lazytime mount option is enabled. We keep track of this + * separately from I_DIRTY_SYNC in order to implement + * lazytime. This gets cleared if I_DIRTY_INODE + * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. I.e. + * either I_DIRTY_TIME *or* I_DIRTY_INODE can be set in + * i_state, but not both. I_DIRTY_PAGES may still be set. * I_NEW Serves as both a mutex and completion notification. * New inodes set I_NEW. If two processes both create * the same inode, one of them will release its inode and @@ -2186,6 +2194,21 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } +/* + * Returns true if the given inode itself only has dirty timestamps (its pages + * may still be dirty) and isn't currently being allocated or freed. + * Filesystems should call this if when writing an inode when lazytime is + * enabled, they want to opportunistically write the timestamps of other inodes + * located very nearby on-disk, e.g. in the same inode block. This returns true + * if the given inode is in need of such an opportunistic update. Requires + * i_lock, or at least later re-checking under i_lock. + */ +static inline bool inode_is_dirtytime_only(struct inode *inode) +{ + return (inode->i_state & (I_DIRTY_TIME | I_NEW | + I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; +} + extern void inc_nlink(struct inode *inode); extern void drop_nlink(struct inode *inode); extern void clear_nlink(struct inode *inode);