diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 89e1bcb21341..b84aa1ca480a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2527,8 +2527,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); +int ext4_dax_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_dio_get_block(struct inode *inode, sector_t iblock, @@ -3334,6 +3334,13 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } } +static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) +{ + int blksize = 1 << inode->i_blkbits; + + return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); +} + #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3e850b988923..37e28082885a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -207,7 +207,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL); + result = __dax_fault(vma, vmf, ext4_dax_get_block, NULL); if (write) { if (!IS_ERR(handle)) @@ -243,7 +243,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, result = VM_FAULT_SIGBUS; else result = __dax_pmd_fault(vma, addr, pmd, flags, - ext4_dax_mmap_get_block, NULL); + ext4_dax_get_block, NULL); if (write) { if (!IS_ERR(handle)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4879e93c91d3..f9ab1e8cc416 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3229,13 +3229,17 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } #ifdef CONFIG_FS_DAX -int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +/* + * Get block function for DAX IO and mmap faults. It takes care of converting + * unwritten extents to written ones and initializes new / converted blocks + * to zeros. + */ +int ext4_dax_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { int ret; - ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n", - inode->i_ino, create); + ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create); if (!create) return _ext4_get_block(inode, iblock, bh_result, 0); @@ -3247,9 +3251,9 @@ int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, if (buffer_unwritten(bh_result)) { /* - * We are protected by i_mmap_sem so we know block cannot go - * away from under us even though we dropped i_data_sem. - * Convert extent to written and write zeros there. + * We are protected by i_mmap_sem or i_mutex so we know block + * cannot go away from under us even though we dropped + * i_data_sem. Convert extent to written and write zeros there. */ ret = ext4_get_block_trans(inode, iblock, bh_result, EXT4_GET_BLOCKS_CONVERT | @@ -3264,6 +3268,14 @@ int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, clear_buffer_new(bh_result); return 0; } +#else +/* Just define empty function, it will never get called. */ +int ext4_dax_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + BUG(); + return 0; +} #endif static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, @@ -3385,8 +3397,20 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter, iocb->private = NULL; if (overwrite) get_block_func = ext4_dio_get_block_overwrite; - else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || - round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { + else if (IS_DAX(inode)) { + /* + * We can avoid zeroing for aligned DAX writes beyond EOF. Other + * writes need zeroing either because they can race with page + * faults or because they use partial blocks. + */ + if (round_down(offset, 1<i_blkbits) >= inode->i_size && + ext4_aligned_io(inode, offset, count)) + get_block_func = ext4_dio_get_block; + else + get_block_func = ext4_dax_get_block; + dio_flags = DIO_LOCKING; + } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || + round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { get_block_func = ext4_dio_get_block; dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; } else if (is_sync_kiocb(iocb)) { @@ -3400,7 +3424,6 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter, BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); #endif if (IS_DAX(inode)) { - dio_flags &= ~DIO_SKIP_HOLES; ret = dax_do_io(iocb, inode, iter, offset, get_block_func, ext4_end_io_dio, dio_flags); } else