From 8f3e2058e1746dc3fb8145f8fbd5ee358cbc1a30 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:29:35 +1000 Subject: [PATCH 1/7] xfs: don't pass ioflags around in the ioctl path Instead check the file pointer for the invisble I/O flag directly, and use the chance to drop redundant arguments from the xfs_ioc_space prototype. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_ioctl.c | 22 ++++++++-------------- fs/xfs/xfs_ioctl.h | 3 --- fs/xfs/xfs_ioctl32.c | 6 +----- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index dbca7375deef..6ab5a247075d 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -595,13 +595,12 @@ xfs_attrmulti_by_handle( int xfs_ioc_space( - struct xfs_inode *ip, - struct inode *inode, struct file *filp, - int ioflags, unsigned int cmd, xfs_flock64_t *bf) { + struct inode *inode = file_inode(filp); + struct xfs_inode *ip = XFS_I(inode); struct iattr iattr; enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL; @@ -626,7 +625,7 @@ xfs_ioc_space( if (filp->f_flags & O_DSYNC) flags |= XFS_PREALLOC_SYNC; - if (ioflags & XFS_IO_INVIS) + if (filp->f_mode & FMODE_NOCMTIME) flags |= XFS_PREALLOC_INVISIBLE; error = mnt_want_write_file(filp); @@ -1464,8 +1463,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) STATIC int xfs_ioc_getbmap( - struct xfs_inode *ip, - int ioflags, + struct file *file, unsigned int cmd, void __user *arg) { @@ -1479,10 +1477,10 @@ xfs_ioc_getbmap( return -EINVAL; bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); - if (ioflags & XFS_IO_INVIS) + if (file->f_mode & FMODE_NOCMTIME) bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; - error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, + error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format, (__force struct getbmap *)arg+1); if (error) return error; @@ -1619,12 +1617,8 @@ xfs_file_ioctl( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; - int ioflags = 0; int error; - if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - trace_xfs_file_ioctl(ip); switch (cmd) { @@ -1643,7 +1637,7 @@ xfs_file_ioctl( if (copy_from_user(&bf, arg, sizeof(bf))) return -EFAULT; - return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + return xfs_ioc_space(filp, cmd, &bf); } case XFS_IOC_DIOINFO: { struct dioattr da; @@ -1702,7 +1696,7 @@ xfs_file_ioctl( case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: - return xfs_ioc_getbmap(ip, ioflags, cmd, arg); + return xfs_ioc_getbmap(filp, cmd, arg); case XFS_IOC_GETBMAPX: return xfs_ioc_getbmapx(ip, arg); diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 77c02c7900b6..8b52881bfd90 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -20,10 +20,7 @@ extern int xfs_ioc_space( - struct xfs_inode *ip, - struct inode *inode, struct file *filp, - int ioflags, unsigned int cmd, xfs_flock64_t *bf); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 1a05d8ae327d..321f57721b92 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -532,12 +532,8 @@ xfs_file_compat_ioctl( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; - int ioflags = 0; int error; - if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - trace_xfs_file_compat_ioctl(ip); switch (cmd) { @@ -589,7 +585,7 @@ xfs_file_compat_ioctl( if (xfs_compat_flock64_copyin(&bf, arg)) return -EFAULT; cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + return xfs_ioc_space(filp, cmd, &bf); } case XFS_IOC_FSGEOMETRY_V1_32: return xfs_compat_ioc_fsgeometry_v1(mp, arg); From 3176c3e0ef32963aa5f6f9754142e420a4ba5d64 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:31:42 +1000 Subject: [PATCH 2/7] xfs: kill ioflags Now that we have the direct I/O kiocb flag there is no real need to sample the value inside of XFS, and the invis flag was always just partially used and isn't worth keeping this infrastructure around for. This also splits the read tracepoint into buffered vs direct as we've done for writes a long time ago. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 26 +++++++++----------------- fs/xfs/xfs_inode.h | 10 ---------- fs/xfs/xfs_trace.h | 19 ++++++++----------- 3 files changed, 17 insertions(+), 38 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 47fc63295422..e51622c8e482 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -292,18 +292,12 @@ xfs_file_read_iter( struct xfs_mount *mp = ip->i_mount; size_t size = iov_iter_count(to); ssize_t ret = 0; - int ioflags = 0; xfs_fsize_t n; loff_t pos = iocb->ki_pos; XFS_STATS_INC(mp, xs_read_calls); - if (unlikely(iocb->ki_flags & IOCB_DIRECT)) - ioflags |= XFS_IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - - if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { + if ((iocb->ki_flags & IOCB_DIRECT) && !IS_DAX(inode)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -336,7 +330,7 @@ xfs_file_read_iter( * serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { + if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); @@ -370,7 +364,10 @@ xfs_file_read_iter( xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - trace_xfs_file_read(ip, size, pos, ioflags); + if (iocb->ki_flags & IOCB_DIRECT) + trace_xfs_file_direct_read(ip, size, pos); + else + trace_xfs_file_buffered_read(ip, size, pos); ret = generic_file_read_iter(iocb, to); if (ret > 0) @@ -389,18 +386,14 @@ xfs_file_splice_read( unsigned int flags) { struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); - int ioflags = 0; ssize_t ret; XFS_STATS_INC(ip->i_mount, xs_read_calls); - if (infilp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + trace_xfs_file_splice_read(ip, count, *ppos); /* * DAX inodes cannot ues the page cache for splice, so we have to push @@ -789,7 +782,7 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + trace_xfs_file_direct_write(ip, count, iocb->ki_pos); data = *from; ret = mapping->a_ops->direct_IO(iocb, &data); @@ -839,8 +832,7 @@ xfs_file_buffered_aio_write( current->backing_dev_info = inode_to_bdi(inode); write_retry: - trace_xfs_file_buffered_write(ip, iov_iter_count(from), - iocb->ki_pos, 0); + trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); ret = generic_perform_write(file, from, iocb->ki_pos); if (likely(ret >= 0)) iocb->ki_pos += ret; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e52d7c7aeb5b..57b66d2bbc45 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -479,14 +479,4 @@ do { \ extern struct kmem_zone *xfs_inode_zone; -/* - * Flags for read/write calls - */ -#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */ -#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */ - -#define XFS_IO_FLAGS \ - { XFS_IO_ISDIRECT, "DIRECT" }, \ - { XFS_IO_INVIS, "INVIS"} - #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ea94ee0fe5ea..a1bc5c64a573 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1134,15 +1134,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn, ) DECLARE_EVENT_CLASS(xfs_file_class, - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), - TP_ARGS(ip, count, offset, flags), + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), + TP_ARGS(ip, count, offset), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) __field(loff_t, offset) __field(size_t, count) - __field(int, flags) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; @@ -1150,23 +1149,21 @@ DECLARE_EVENT_CLASS(xfs_file_class, __entry->size = ip->i_d.di_size; __entry->offset = offset; __entry->count = count; - __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx " - "offset 0x%llx count 0x%zx ioflags %s", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, - __entry->count, - __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) + __entry->count) ) #define DEFINE_RW_EVENT(name) \ DEFINE_EVENT(xfs_file_class, name, \ - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ - TP_ARGS(ip, count, offset, flags)) -DEFINE_RW_EVENT(xfs_file_read); + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \ + TP_ARGS(ip, count, offset)) +DEFINE_RW_EVENT(xfs_file_buffered_read); +DEFINE_RW_EVENT(xfs_file_direct_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_splice_read); From cf810712cc82cbfab8f08a46ca6c0289d386a303 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:31:53 +1000 Subject: [PATCH 3/7] xfs: remove s_maxbytes enforcement in xfs_file_read_iter All the three low-level read implementations that we might call already take care of not overflowing the maximum supported bytes, no need to duplicate it here. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e51622c8e482..7ec8225b7fd2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -292,7 +292,6 @@ xfs_file_read_iter( struct xfs_mount *mp = ip->i_mount; size_t size = iov_iter_count(to); ssize_t ret = 0; - xfs_fsize_t n; loff_t pos = iocb->ki_pos; XFS_STATS_INC(mp, xs_read_calls); @@ -309,13 +308,6 @@ xfs_file_read_iter( } } - n = mp->m_super->s_maxbytes - pos; - if (n <= 0 || size == 0) - return 0; - - if (n < size) - size = n; - if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; From bbc5a740c4f27a9732a3a3decf3186b4bce21108 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:35:42 +1000 Subject: [PATCH 4/7] xfs: split xfs_file_read_iter into buffered and direct I/O helpers Similar to what we did on the write side a while ago. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 87 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 28 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7ec8225b7fd2..fdb123ffd616 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -282,35 +282,33 @@ xfs_file_fsync( } STATIC ssize_t -xfs_file_read_iter( +xfs_file_dio_aio_read( struct kiocb *iocb, struct iov_iter *to) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - size_t size = iov_iter_count(to); + size_t count = iov_iter_count(to); + struct xfs_buftarg *target; ssize_t ret = 0; - loff_t pos = iocb->ki_pos; - XFS_STATS_INC(mp, xs_read_calls); + trace_xfs_file_direct_read(ip, count, iocb->ki_pos); - if ((iocb->ki_flags & IOCB_DIRECT) && !IS_DAX(inode)) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; + if (XFS_IS_REALTIME_INODE(ip)) + target = ip->i_mount->m_rtdev_targp; + else + target = ip->i_mount->m_ddev_targp; + + if (!IS_DAX(inode)) { /* DIO must be aligned to device logical sector size */ - if ((pos | size) & target->bt_logical_sectormask) { - if (pos == i_size_read(inode)) + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { + if (iocb->ki_pos == i_size_read(inode)) return 0; return -EINVAL; } } - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - /* * Locking is a bit tricky here. If we take an exclusive lock for direct * IO, we effectively serialise all new concurrent read IO to this file @@ -322,7 +320,7 @@ xfs_file_read_iter( * serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_mapping->nrpages) { + if (mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); @@ -337,8 +335,8 @@ xfs_file_read_iter( * flush and reduce the chances of repeated iolock cycles going * forward. */ - if (inode->i_mapping->nrpages) { - ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (mapping->nrpages) { + ret = filemap_write_and_wait(mapping); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; @@ -349,23 +347,56 @@ xfs_file_read_iter( * we fail to invalidate a page, but this should never * happen on XFS. Warn if it does fail. */ - ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); + ret = invalidate_inode_pages2(mapping); WARN_ON_ONCE(ret); ret = 0; } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - if (iocb->ki_flags & IOCB_DIRECT) - trace_xfs_file_direct_read(ip, size, pos); - else - trace_xfs_file_buffered_read(ip, size, pos); - ret = generic_file_read_iter(iocb, to); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + return ret; +} + +STATIC ssize_t +xfs_file_buffered_aio_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + ssize_t ret; + + trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); + + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + ret = generic_file_read_iter(iocb, to); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + return ret; +} + +STATIC ssize_t +xfs_file_read_iter( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct xfs_mount *mp = XFS_I(file_inode(iocb->ki_filp))->i_mount; + ssize_t ret = 0; + + XFS_STATS_INC(mp, xs_read_calls); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (iocb->ki_flags & IOCB_DIRECT) + ret = xfs_file_dio_aio_read(iocb, to); + else + ret = xfs_file_buffered_aio_read(iocb, to); + if (ret > 0) XFS_STATS_ADD(mp, xs_read_bytes, ret); - - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -747,7 +778,7 @@ xfs_file_dio_aio_write( end = iocb->ki_pos + count - 1; /* - * See xfs_file_read_iter() for why we do a full-file flush here. + * See xfs_file_dio_aio_read() for why we do a full-file flush here. */ if (mapping->nrpages) { ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); From f1285ff0acf9040a39921355d07bd83a3308c402 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:36:57 +1000 Subject: [PATCH 5/7] xfs: stop using generic_file_read_iter for direct I/O XFS already implement it's own flushing of the pagecache because it implements proper synchronization for direct I/O reads. This means calling generic_file_read_iter for direct I/O is rather useless, as it doesn't do much but updating the atime and iocb position for us. This also gets rid of the buffered I/O fallback that isn't used for XFS. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index fdb123ffd616..440bb8b5c64d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -289,12 +289,17 @@ xfs_file_dio_aio_read( struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); + loff_t isize = i_size_read(inode); size_t count = iov_iter_count(to); + struct iov_iter data; struct xfs_buftarg *target; ssize_t ret = 0; trace_xfs_file_direct_read(ip, count, iocb->ki_pos); + if (!count) + return 0; /* skip atime */ + if (XFS_IS_REALTIME_INODE(ip)) target = ip->i_mount->m_rtdev_targp; else @@ -303,7 +308,7 @@ xfs_file_dio_aio_read( if (!IS_DAX(inode)) { /* DIO must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { - if (iocb->ki_pos == i_size_read(inode)) + if (iocb->ki_pos == isize) return 0; return -EINVAL; } @@ -354,9 +359,15 @@ xfs_file_dio_aio_read( xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - ret = generic_file_read_iter(iocb, to); + data = *to; + ret = mapping->a_ops->direct_IO(iocb, &data); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(to, ret); + } xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + file_accessed(iocb->ki_filp); return ret; } From fa8d972d055c723cc427e14d4d7919640f418730 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:38:01 +1000 Subject: [PATCH 6/7] xfs: direct calls in the direct I/O path We control both the callers and callees of ->direct_IO, so remove the indirect calls. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 24 +++++------------------- fs/xfs/xfs_aops.h | 3 +++ fs/xfs/xfs_file.c | 17 +++++++++++++++-- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4c463b99fe57..3ba0809e0be8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1336,7 +1336,7 @@ xfs_get_blocks_dax_fault( * whereas if we have flags set we will always be called in task context * (i.e. from a workqueue). */ -STATIC int +int xfs_end_io_direct_write( struct kiocb *iocb, loff_t offset, @@ -1407,24 +1407,10 @@ xfs_vm_direct_IO( struct kiocb *iocb, struct iov_iter *iter) { - struct inode *inode = iocb->ki_filp->f_mapping->host; - dio_iodone_t *endio = NULL; - int flags = 0; - struct block_device *bdev; - - if (iov_iter_rw(iter) == WRITE) { - endio = xfs_end_io_direct_write; - flags = DIO_ASYNC_EXTEND; - } - - if (IS_DAX(inode)) { - return dax_do_io(iocb, inode, iter, - xfs_get_blocks_direct, endio, 0); - } - - bdev = xfs_find_bdev_for_inode(inode); - return __blockdev_direct_IO(iocb, inode, bdev, iter, - xfs_get_blocks_direct, endio, NULL, flags); + /* + * We just need the method present so that open/fcntl allow direct I/O. + */ + return -EINVAL; } /* diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 814aab790713..bf2d9a141a73 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -60,6 +60,9 @@ int xfs_get_blocks_direct(struct inode *inode, sector_t offset, int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); +int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, + ssize_t size, void *private); + extern void xfs_count_page_state(struct page *, int *, int *); extern struct block_device *xfs_find_bdev_for_inode(struct inode *); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 440bb8b5c64d..dd5185dafc9f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -360,7 +360,13 @@ xfs_file_dio_aio_read( } data = *to; - ret = mapping->a_ops->direct_IO(iocb, &data); + if (IS_DAX(inode)) { + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, + NULL, 0); + } else { + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, NULL, NULL, 0); + } if (ret > 0) { iocb->ki_pos += ret; iov_iter_advance(to, ret); @@ -819,7 +825,14 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos); data = *from; - ret = mapping->a_ops->direct_IO(iocb, &data); + if (IS_DAX(inode)) { + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, + xfs_end_io_direct_write, 0); + } else { + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, xfs_end_io_direct_write, + NULL, DIO_ASYNC_EXTEND); + } /* see generic_file_direct_write() for why this is necessary */ if (mapping->nrpages) { From 16d4d43595b4780daac8fcea6d042689124cb094 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:38:55 +1000 Subject: [PATCH 7/7] xfs: split direct I/O and DAX path So far the DAX code overloaded the direct I/O code path. There is very little in common between the two, and untangling them allows to clean up both variants. As a side effect we also get separate trace points for both I/O types. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 139 +++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_trace.h | 2 + 2 files changed, 112 insertions(+), 29 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index dd5185dafc9f..d97e8cb99a59 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -305,13 +305,11 @@ xfs_file_dio_aio_read( else target = ip->i_mount->m_ddev_targp; - if (!IS_DAX(inode)) { - /* DIO must be aligned to device logical sector size */ - if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { - if (iocb->ki_pos == isize) - return 0; - return -EINVAL; - } + /* DIO must be aligned to device logical sector size */ + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { + if (iocb->ki_pos == isize) + return 0; + return -EINVAL; } /* @@ -360,13 +358,37 @@ xfs_file_dio_aio_read( } data = *to; - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, - NULL, 0); - } else { - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, NULL, NULL, 0); + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, NULL, NULL, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(to, ret); } + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + file_accessed(iocb->ki_filp); + return ret; +} + +STATIC ssize_t +xfs_file_dax_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct iov_iter data = *to; + size_t count = iov_iter_count(to); + ssize_t ret = 0; + + trace_xfs_file_dax_read(ip, count, iocb->ki_pos); + + if (!count) + return 0; /* skip atime */ + + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); if (ret > 0) { iocb->ki_pos += ret; iov_iter_advance(to, ret); @@ -399,7 +421,8 @@ xfs_file_read_iter( struct kiocb *iocb, struct iov_iter *to) { - struct xfs_mount *mp = XFS_I(file_inode(iocb->ki_filp))->i_mount; + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_mount *mp = XFS_I(inode)->i_mount; ssize_t ret = 0; XFS_STATS_INC(mp, xs_read_calls); @@ -407,7 +430,9 @@ xfs_file_read_iter( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (iocb->ki_flags & IOCB_DIRECT) + if (IS_DAX(inode)) + ret = xfs_file_dax_read(iocb, to); + else if (iocb->ki_flags & IOCB_DIRECT) ret = xfs_file_dio_aio_read(iocb, to); else ret = xfs_file_buffered_aio_read(iocb, to); @@ -755,8 +780,7 @@ xfs_file_dio_aio_write( mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if (!IS_DAX(inode) && - ((iocb->ki_pos | count) & target->bt_logical_sectormask)) + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -825,14 +849,9 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos); data = *from; - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, - xfs_end_io_direct_write, 0); - } else { - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, xfs_end_io_direct_write, - NULL, DIO_ASYNC_EXTEND); - } + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, xfs_end_io_direct_write, + NULL, DIO_ASYNC_EXTEND); /* see generic_file_direct_write() for why this is necessary */ if (mapping->nrpages) { @@ -849,10 +868,70 @@ out: xfs_rw_iunlock(ip, iolock); /* - * No fallback to buffered IO on errors for XFS. DAX can result in - * partial writes, but direct IO will either complete fully or fail. + * No fallback to buffered IO on errors for XFS, direct IO will either + * complete fully or fail. */ - ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); + ASSERT(ret < 0 || ret == count); + return ret; +} + +STATIC ssize_t +xfs_file_dax_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + int unaligned_io = 0; + int iolock; + struct iov_iter data; + + /* "unaligned" here means not aligned to a filesystem block */ + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { + unaligned_io = 1; + iolock = XFS_IOLOCK_EXCL; + } else if (mapping->nrpages) { + iolock = XFS_IOLOCK_EXCL; + } else { + iolock = XFS_IOLOCK_SHARED; + } + xfs_rw_ilock(ip, iolock); + + ret = xfs_file_aio_write_checks(iocb, from, &iolock); + if (ret) + goto out; + + /* + * Yes, even DAX files can have page cache attached to them: A zeroed + * page is inserted into the pagecache when we have to serve a write + * fault on a hole. It should never be dirtied and can simply be + * dropped from the pagecache once we get real data for the page. + */ + if (mapping->nrpages) { + ret = invalidate_inode_pages2(mapping); + WARN_ON_ONCE(ret); + } + + if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); + + data = *from; + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, + xfs_end_io_direct_write, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(from, ret); + } +out: + xfs_rw_iunlock(ip, iolock); return ret; } @@ -934,7 +1013,9 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) + if (IS_DAX(inode)) + ret = xfs_file_dax_write(iocb, from); + else if (iocb->ki_flags & IOCB_DIRECT) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a1bc5c64a573..c2876917dd89 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1164,8 +1164,10 @@ DEFINE_EVENT(xfs_file_class, name, \ TP_ARGS(ip, count, offset)) DEFINE_RW_EVENT(xfs_file_buffered_read); DEFINE_RW_EVENT(xfs_file_direct_read); +DEFINE_RW_EVENT(xfs_file_dax_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); +DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_file_splice_read); DECLARE_EVENT_CLASS(xfs_page_class,