diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 188cbbd5b74a..4663f7dbff1c 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1295,6 +1295,11 @@ _pagebuf_ioapply( rw = (pb->pb_flags & PBF_READ) ? READ : WRITE; } + if (pb->pb_flags & PBF_ORDERED) { + ASSERT(!(pb->pb_flags & PBF_READ)); + rw = WRITE_BARRIER; + } + /* Special code path for reading a sub page size pagebuf in -- * we populate up the whole page, and hence the other metadata * in the same page. This optimization is only valid when the diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 39c8ca122534..fa21d1f9cb0b 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -74,7 +74,7 @@ typedef enum page_buf_flags_e { /* pb_flags values */ PBF_DELWRI = (1 << 6), /* buffer has dirty pages */ PBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ PBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ - PBF_FLUSH = (1 << 11), /* flush disk write cache */ + PBF_ORDERED = (1 << 11), /* use ordered writes */ PBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ /* flags used only as arguments to access routines */ @@ -383,9 +383,9 @@ extern void pagebuf_trace( #define XFS_BUF_UNASYNC(x) ((x)->pb_flags &= ~PBF_ASYNC) #define XFS_BUF_ISASYNC(x) ((x)->pb_flags & PBF_ASYNC) -#define XFS_BUF_FLUSH(x) ((x)->pb_flags |= PBF_FLUSH) -#define XFS_BUF_UNFLUSH(x) ((x)->pb_flags &= ~PBF_FLUSH) -#define XFS_BUF_ISFLUSH(x) ((x)->pb_flags & PBF_FLUSH) +#define XFS_BUF_ORDERED(x) ((x)->pb_flags |= PBF_ORDERED) +#define XFS_BUF_UNORDERED(x) ((x)->pb_flags &= ~PBF_ORDERED) +#define XFS_BUF_ISORDERED(x) ((x)->pb_flags & PBF_ORDERED) #define XFS_BUF_SHUT(x) printk("XFS_BUF_SHUT not implemented yet\n") #define XFS_BUF_UNSHUT(x) printk("XFS_BUF_UNSHUT not implemented yet\n") diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 2302454d8d47..d2701cc624b9 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -278,6 +278,72 @@ xfs_blkdev_put( close_bdev_excl(bdev); } +/* + * Try to write out the superblock using barriers. + */ +STATIC int +xfs_barrier_test( + xfs_mount_t *mp) +{ + xfs_buf_t *sbp = xfs_getsb(mp, 0); + int error; + + XFS_BUF_UNDONE(sbp); + XFS_BUF_UNREAD(sbp); + XFS_BUF_UNDELAYWRITE(sbp); + XFS_BUF_WRITE(sbp); + XFS_BUF_UNASYNC(sbp); + XFS_BUF_ORDERED(sbp); + + xfsbdstrat(mp, sbp); + error = xfs_iowait(sbp); + + /* + * Clear all the flags we set and possible error state in the + * buffer. We only did the write to try out whether barriers + * worked and shouldn't leave any traces in the superblock + * buffer. + */ + XFS_BUF_DONE(sbp); + XFS_BUF_ERROR(sbp, 0); + XFS_BUF_UNORDERED(sbp); + + xfs_buf_relse(sbp); + return error; +} + +void +xfs_mountfs_check_barriers(xfs_mount_t *mp) +{ + int error; + + if (mp->m_logdev_targp != mp->m_ddev_targp) { + xfs_fs_cmn_err(CE_NOTE, mp, + "Disabling barriers, not supported with external log device"); + mp->m_flags &= ~XFS_MOUNT_BARRIER; + } + + if (mp->m_ddev_targp->pbr_bdev->bd_disk->queue->ordered == + QUEUE_ORDERED_NONE) { + xfs_fs_cmn_err(CE_NOTE, mp, + "Disabling barriers, not supported by the underlying device"); + mp->m_flags &= ~XFS_MOUNT_BARRIER; + } + + error = xfs_barrier_test(mp); + if (error) { + xfs_fs_cmn_err(CE_NOTE, mp, + "Disabling barriers, trial barrier write failed"); + mp->m_flags &= ~XFS_MOUNT_BARRIER; + } +} + +void +xfs_blkdev_issue_flush( + xfs_buftarg_t *buftarg) +{ + blkdev_issue_flush(buftarg->pbr_bdev, NULL); +} STATIC struct inode * linvfs_alloc_inode( diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index ec7e0035c731..ad77e3743e04 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -132,6 +132,7 @@ extern void xfs_flush_device(struct xfs_inode *); extern int xfs_blkdev_get(struct xfs_mount *, const char *, struct block_device **); extern void xfs_blkdev_put(struct block_device *); +extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern struct export_operations linvfs_export_ops; diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h index b3215ffe0be8..c93cb282f3d8 100644 --- a/fs/xfs/xfs_clnt.h +++ b/fs/xfs/xfs_clnt.h @@ -99,7 +99,7 @@ struct xfs_mount_args { * enforcement */ #define XFSMNT_NOUUID 0x01000000 /* Ignore fs uuid */ #define XFSMNT_DMAPI 0x02000000 /* enable dmapi/xdsm */ -#define XFSMNT_NOLOGFLUSH 0x04000000 /* Don't flush for log blocks */ +#define XFSMNT_BARRIER 0x04000000 /* use write barriers */ #define XFSMNT_IDELETE 0x08000000 /* inode cluster delete */ #define XFSMNT_SWALLOC 0x10000000 /* turn on stripe width * allocation */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 51814c32eddf..b9d3ad35240e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -93,8 +93,11 @@ STATIC int xlog_state_release_iclog(xlog_t *log, STATIC void xlog_state_switch_iclogs(xlog_t *log, xlog_in_core_t *iclog, int eventual_size); -STATIC int xlog_state_sync(xlog_t *log, xfs_lsn_t lsn, uint flags); -STATIC int xlog_state_sync_all(xlog_t *log, uint flags); +STATIC int xlog_state_sync(xlog_t *log, + xfs_lsn_t lsn, + uint flags, + int *log_flushed); +STATIC int xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed); STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); /* local functions to manipulate grant head */ @@ -312,12 +315,17 @@ xfs_log_done(xfs_mount_t *mp, * semaphore. */ int -xfs_log_force(xfs_mount_t *mp, - xfs_lsn_t lsn, - uint flags) +_xfs_log_force( + xfs_mount_t *mp, + xfs_lsn_t lsn, + uint flags, + int *log_flushed) { - int rval; - xlog_t *log = mp->m_log; + xlog_t *log = mp->m_log; + int dummy; + + if (!log_flushed) + log_flushed = &dummy; #if defined(DEBUG) || defined(XLOG_NOLOG) if (!xlog_debug && xlog_target == log->l_targ) @@ -328,17 +336,12 @@ xfs_log_force(xfs_mount_t *mp, XFS_STATS_INC(xs_log_force); - if ((log->l_flags & XLOG_IO_ERROR) == 0) { - if (lsn == 0) - rval = xlog_state_sync_all(log, flags); - else - rval = xlog_state_sync(log, lsn, flags); - } else { - rval = XFS_ERROR(EIO); - } - - return rval; - + if (log->l_flags & XLOG_IO_ERROR) + return XFS_ERROR(EIO); + if (lsn == 0) + return xlog_state_sync_all(log, flags, log_flushed); + else + return xlog_state_sync(log, lsn, flags, log_flushed); } /* xfs_log_force */ /* @@ -1467,14 +1470,13 @@ xlog_sync(xlog_t *log, XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); /* - * Do a disk write cache flush for the log block. - * This is a bit of a sledgehammer, it would be better - * to use a tag barrier here that just prevents reordering. + * Do an ordered write for the log block. + * * It may not be needed to flush the first split block in the log wrap * case, but do it anyways to be safe -AK */ - if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH)) - XFS_BUF_FLUSH(bp); + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) + XFS_BUF_ORDERED(bp); ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -1505,8 +1507,8 @@ xlog_sync(xlog_t *log, XFS_BUF_SET_FSPRIVATE(bp, iclog); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); - if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH)) - XFS_BUF_FLUSH(bp); + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) + XFS_BUF_ORDERED(bp); dptr = XFS_BUF_PTR(bp); /* * Bump the cycle numbers at the start of each block @@ -2951,7 +2953,7 @@ xlog_state_switch_iclogs(xlog_t *log, * not in the active nor dirty state. */ STATIC int -xlog_state_sync_all(xlog_t *log, uint flags) +xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) { xlog_in_core_t *iclog; xfs_lsn_t lsn; @@ -3000,6 +3002,7 @@ xlog_state_sync_all(xlog_t *log, uint flags) if (xlog_state_release_iclog(log, iclog)) return XFS_ERROR(EIO); + *log_flushed = 1; s = LOG_LOCK(log); if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) == lsn && iclog->ic_state != XLOG_STATE_DIRTY) @@ -3043,6 +3046,7 @@ xlog_state_sync_all(xlog_t *log, uint flags) */ if (iclog->ic_state & XLOG_STATE_IOERROR) return XFS_ERROR(EIO); + *log_flushed = 1; } else { @@ -3068,7 +3072,8 @@ xlog_state_sync_all(xlog_t *log, uint flags) int xlog_state_sync(xlog_t *log, xfs_lsn_t lsn, - uint flags) + uint flags, + int *log_flushed) { xlog_in_core_t *iclog; int already_slept = 0; @@ -3120,6 +3125,7 @@ xlog_state_sync(xlog_t *log, XFS_STATS_INC(xs_log_force_sleep); sv_wait(&iclog->ic_prev->ic_writesema, PSWP, &log->l_icloglock, s); + *log_flushed = 1; already_slept = 1; goto try_again; } else { @@ -3128,6 +3134,7 @@ xlog_state_sync(xlog_t *log, LOG_UNLOCK(log, s); if (xlog_state_release_iclog(log, iclog)) return XFS_ERROR(EIO); + *log_flushed = 1; s = LOG_LOCK(log); } } @@ -3152,6 +3159,7 @@ xlog_state_sync(xlog_t *log, */ if (iclog->ic_state & XLOG_STATE_IOERROR) return XFS_ERROR(EIO); + *log_flushed = 1; } else { /* just return */ LOG_UNLOCK(log, s); } @@ -3606,6 +3614,7 @@ xfs_log_force_umount( xlog_ticket_t *tic; xlog_t *log; int retval; + int dummy; SPLDECL(s); SPLDECL(s2); @@ -3684,7 +3693,7 @@ xfs_log_force_umount( * Force the incore logs to disk before shutting the * log down completely. */ - xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC); + xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy); s2 = LOG_LOCK(log); retval = xlog_state_ioerror(log); LOG_UNLOCK(log, s2); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 18961119fc65..dc920f83412d 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -174,9 +174,12 @@ xfs_lsn_t xfs_log_done(struct xfs_mount *mp, xfs_log_ticket_t ticket, void **iclog, uint flags); -int xfs_log_force(struct xfs_mount *mp, - xfs_lsn_t lsn, - uint flags); +int _xfs_log_force(struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags, + int *log_forced); +#define xfs_log_force(mp, lsn, flags) \ + _xfs_log_force(mp, lsn, flags, NULL); int xfs_log_mount(struct xfs_mount *mp, struct xfs_buftarg *log_target, xfs_daddr_t start_block, diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 5affba38a577..bc55931ac74e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -415,7 +415,7 @@ typedef struct xfs_mount { * 32 bits in size */ #define XFS_MOUNT_32BITINOOPT 0x00008000 /* saved mount option state */ #define XFS_MOUNT_NOUUID 0x00010000 /* ignore uuid during mount */ -#define XFS_MOUNT_NOLOGFLUSH 0x00020000 +#define XFS_MOUNT_BARRIER 0x00020000 #define XFS_MOUNT_IDELETE 0x00040000 /* delete empty inode clusters*/ #define XFS_MOUNT_SWALLOC 0x00080000 /* turn on stripe width * allocation */ @@ -542,6 +542,7 @@ extern xfs_mount_t *xfs_mount_init(void); extern void xfs_mod_sb(xfs_trans_t *, __int64_t); extern void xfs_mount_free(xfs_mount_t *mp, int remove_bhv); extern int xfs_mountfs(struct vfs *, xfs_mount_t *mp, int); +extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); extern int xfs_unmountfs(xfs_mount_t *, struct cred *); extern void xfs_unmountfs_close(xfs_mount_t *, struct cred *); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 92efe272b83d..5e33891b8049 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -661,10 +661,11 @@ xfs_trans_unreserve_and_mod_sb( */ /*ARGSUSED*/ int -xfs_trans_commit( +_xfs_trans_commit( xfs_trans_t *tp, uint flags, - xfs_lsn_t *commit_lsn_p) + xfs_lsn_t *commit_lsn_p, + int *log_flushed) { xfs_log_iovec_t *log_vector; int nvec; @@ -893,9 +894,11 @@ xfs_trans_commit( * log out now and wait for it. */ if (sync) { - if (!error) - error = xfs_log_force(mp, commit_lsn, - XFS_LOG_FORCE | XFS_LOG_SYNC); + if (!error) { + error = _xfs_log_force(mp, commit_lsn, + XFS_LOG_FORCE | XFS_LOG_SYNC, + log_flushed); + } XFS_STATS_INC(xs_trans_sync); } else { XFS_STATS_INC(xs_trans_async); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index a263aec8b3a6..0cc7af5c1f00 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -1025,7 +1025,12 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t); -int xfs_trans_commit(xfs_trans_t *, uint flags, xfs_lsn_t *); +int _xfs_trans_commit(xfs_trans_t *, + uint flags, + xfs_lsn_t *, + int *); +#define xfs_trans_commit(tp, flags, lsn) \ + _xfs_trans_commit(tp, flags, lsn, NULL) void xfs_trans_cancel(xfs_trans_t *, int); void xfs_trans_ail_init(struct xfs_mount *); xfs_lsn_t xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t); diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index f1a904e23ade..8238c7517822 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -321,8 +321,8 @@ xfs_start_flags( if (ap->flags & XFSMNT_NOUUID) mp->m_flags |= XFS_MOUNT_NOUUID; - if (ap->flags & XFSMNT_NOLOGFLUSH) - mp->m_flags |= XFS_MOUNT_NOLOGFLUSH; + if (ap->flags & XFSMNT_BARRIER) + mp->m_flags |= XFS_MOUNT_BARRIER; return 0; } @@ -512,8 +512,14 @@ xfs_mount( goto error2; error = XFS_IOINIT(vfsp, args, flags); - if (!error) - return 0; + if (error) + goto error2; + + if ((args->flags & XFSMNT_BARRIER) && + !(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)) + xfs_mountfs_check_barriers(mp); + return 0; + error2: if (mp->m_sb_bp) xfs_freesb(mp); @@ -656,19 +662,24 @@ xfs_mntupdate( else mp->m_flags &= ~XFS_MOUNT_NOATIME; - if (!(vfsp->vfs_flag & VFS_RDONLY)) { - VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error); + if ((vfsp->vfs_flag & VFS_RDONLY) && + !(*flags & MS_RDONLY)) { + vfsp->vfs_flag &= ~VFS_RDONLY; + + if (args->flags & XFSMNT_BARRIER) + xfs_mountfs_check_barriers(mp); } - if (*flags & MS_RDONLY) { + if (!(vfsp->vfs_flag & VFS_RDONLY) && + (*flags & MS_RDONLY)) { + VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error); + xfs_quiesce_fs(mp); /* Ok now write out an unmount record */ xfs_log_unmount_write(mp); xfs_unmountfs_writesb(mp); vfsp->vfs_flag |= VFS_RDONLY; - } else { - vfsp->vfs_flag &= ~VFS_RDONLY; } return 0; @@ -1628,7 +1639,8 @@ xfs_vget( #define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */ #define MNTOPT_IHASHSIZE "ihashsize" /* size of inode hash table */ #define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */ -#define MNTOPT_NOLOGFLUSH "nologflush" /* don't hard flush on log writes */ +#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and + unwritten extent conversion */ #define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */ #define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ #define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ @@ -1791,8 +1803,8 @@ xfs_parseargs( #endif } else if (!strcmp(this_char, MNTOPT_NOUUID)) { args->flags |= XFSMNT_NOUUID; - } else if (!strcmp(this_char, MNTOPT_NOLOGFLUSH)) { - args->flags |= XFSMNT_NOLOGFLUSH; + } else if (!strcmp(this_char, MNTOPT_BARRIER)) { + args->flags |= XFSMNT_BARRIER; } else if (!strcmp(this_char, MNTOPT_IKEEP)) { args->flags &= ~XFSMNT_IDELETE; } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { @@ -1866,7 +1878,7 @@ xfs_showargs( { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC }, - { XFS_MOUNT_NOLOGFLUSH, "," MNTOPT_NOLOGFLUSH }, + { XFS_MOUNT_BARRIER, "," MNTOPT_BARRIER }, { XFS_MOUNT_IDELETE, "," MNTOPT_NOIKEEP }, { 0, NULL } }; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 58bfe629b933..e2bf2ef58b66 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1118,6 +1118,7 @@ xfs_fsync( xfs_inode_t *ip; xfs_trans_t *tp; int error; + int log_flushed = 0, changed = 1; vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__, (inst_t *)__return_address); @@ -1171,10 +1172,18 @@ xfs_fsync( xfs_iunlock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) { - xfs_log_force(ip->i_mount, (xfs_lsn_t)0, + _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE | ((flag & FSYNC_WAIT) - ? XFS_LOG_SYNC : 0)); + ? XFS_LOG_SYNC : 0), + &log_flushed); + } else { + /* + * If the inode is not pinned and nothing + * has changed we don't need to flush the + * cache. + */ + changed = 0; } error = 0; } else { @@ -1210,10 +1219,27 @@ xfs_fsync( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (flag & FSYNC_WAIT) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0, NULL); + error = _xfs_trans_commit(tp, 0, NULL, &log_flushed); xfs_iunlock(ip, XFS_ILOCK_EXCL); } + + if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) { + /* + * If the log write didn't issue an ordered tag we need + * to flush the disk cache for the data device now. + */ + if (!log_flushed) + xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp); + + /* + * If this inode is on the RT dev we need to flush that + * cache aswell. + */ + if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) + xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp); + } + return error; }